In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets

In [29]:
import pandas as pd

df = pd.read_csv('selected_features_training.csv')

# Step 3: Preprocess the Data

# Separate features and target variable
X = df.drop('label', axis=1)
y = df['label']

In [30]:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Convert DataFrame to NumPy and then to PyTorch tensor
X_train = torch.tensor(X_train.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.long)
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.long)


In [32]:
X_train

tensor([[0.5000, 0.3478, 0.0000,  ..., 0.0600, 0.0000, 0.0000],
        [0.5000, 0.7101, 0.0000,  ..., 0.0000, 1.0000, 1.0000],
        [0.5000, 0.3478, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.5000, 0.3478, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.5000, 0.3478, 0.0000,  ..., 0.0500, 0.0000, 0.0000],
        [1.0000, 0.1739, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [33]:
print(X_train.shape)  # Expected: (num_samples, num_features)

torch.Size([100778, 20])


In [34]:
from tensorflow.keras.models import load_model
teacher_model = load_model("pnn_model.h5")




In [35]:
teacher_logits = torch.tensor(teacher_model.predict(X_train), dtype=torch.float32)
teacher_logits

[1m3150/3150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step


tensor([[2.7812e-09, 3.4620e-20, 1.2752e-25,  ..., 1.3662e-35, 1.1516e-13,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [9.3335e-25, 2.1447e-31, 0.0000e+00,  ..., 5.4336e-24, 0.0000e+00,
         0.0000e+00],
        ...,
        [7.8920e-25, 6.2439e-31, 0.0000e+00,  ..., 2.9868e-23, 0.0000e+00,
         0.0000e+00],
        [1.4533e-06, 1.7698e-14, 2.0384e-19,  ..., 4.0989e-29, 2.1206e-11,
         2.5264e-30],
        [5.3072e-28, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]])

In [36]:
import torch.nn as nn

class StudentModel(nn.Module):
    def __init__(self):
        super(StudentModel, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 64)  # Input layer to a smaller hidden layer
        self.fc2 = nn.Linear(64, len(torch.unique(y_train)))  # Hidden layer to output layer

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [37]:
student_model = StudentModel()

In [38]:
student_model

StudentModel(
  (fc1): Linear(in_features=20, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=23, bias=True)
)

In [39]:
alpha = 0.5         # Weight for soft vs hard loss
temperature = 3.0   # Temperature for softening the teacher's outputs
learning_rate = 0.001  # Learning rate for the optimizer
num_epochs = 100     # Number of training epochs

In [40]:
optimizer = torch.optim.Adam(student_model.parameters(), lr=learning_rate)


In [41]:
def distillation_loss(student_logits, teacher_logits, true_labels, alpha=0.5, temperature=3.0):
    # Compute teacher probabilities (softened outputs)
    teacher_probs = torch.softmax(teacher_logits / temperature, dim=1)
    # Compute student probabilities
    student_probs = torch.log_softmax(student_logits / temperature, dim=1)

    # Compute the soft loss (KL Divergence between student and teacher)
    soft_loss = nn.KLDivLoss(reduction="batchmean")(student_probs, teacher_probs)

    # Compute the hard loss (CrossEntropy with true labels)
    hard_loss = nn.CrossEntropyLoss()(student_logits, true_labels)

    # Combine soft and hard losses
    return alpha * soft_loss + (1 - alpha) * hard_loss


In [None]:
for epoch in range(num_epochs):
    student_model.train()
    optimizer.zero_grad()

    # Get student's predictions
    student_logits = student_model(X_train)

    # Calculate the distillation loss
    loss = distillation_loss(student_logits, teacher_logits, y_train, alpha, temperature)

    # Backpropagation
    loss.backward()
    optimizer.step()

    # Print progress every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")


In [None]:
student_model.eval()
with torch.no_grad():
    test_logits = student_model(X_test)
    predictions = torch.argmax(test_logits, dim=1)

# Calculate accuracy
accuracy = accuracy_score(y_test.numpy(), predictions.numpy())
print(f"Student Model Accuracy: {accuracy:.4f}")


Student Model Accuracy: 0.9140


In [20]:
torch.save(student_model.state_dict(), "student_model.h5")


In [25]:
teacher_size = teacher_model.count_params()
student_size = sum(p.numel() for p in student_model.parameters())

print(f"Teacher Model Size: {teacher_size} parameters")
print(f"Student Model Size: {student_size} parameters")


Teacher Model Size: 88599 parameters
Student Model Size: 2839 parameters


In [44]:
print(f'Redcution by {(1- (student_size/teacher_size)) *100} %')

Redcution by 96.7956748947505 %
