<a href="https://colab.research.google.com/github/nehaa56789/neural_symbolic_learner_model/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Purely Neural - TRANSFORMER

##Dataset Preprocessing

In [56]:
import pandas as pd
import numpy as np
import torch.nn as nn
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import pickle

In [57]:
# ----------------------
# 1. Load & clean
# ----------------------
df = pd.read_csv("train_data.csv")

In [58]:
# Sort per student chronologically
df = df.sort_values(["student_id", "start_time"]).reset_index(drop=True)

In [59]:
# Convert correct column to boolean (fill NA with False)
df["correct"] = df["correct"].fillna(False).astype(bool)

  df["correct"] = df["correct"].fillna(False).astype(bool)


In [60]:
# ----------------------
# 2. Handle missing values
# ----------------------
num_cols = df.select_dtypes(include=["float64", "int64"]).columns
cat_cols = df.select_dtypes(include=["object", "bool"]).columns

# Fill numeric with median
for col in num_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Fill categorical with "unknown"
for col in cat_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna("unknown")

In [61]:
# ----------------------
# 3. Encode categoricals
# ----------------------
label_encoders = {}
for col in ["problem_type", "content_source", "skills", "tutoring_types","answer_before_tutoring","account_creation_date"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Save encoders
with open("label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)


In [62]:
# ----------------------
# 4. Scale numeric features
# ----------------------
numeric_features = [
    "time_on_task",
    "fraction_of_hints_used",
    "attempt_count",
    "student_answer_count",
    "mean_correct",
    "mean_time_on_task",
    "started_problem_sets_count",
    "completed_problem_sets_count",
    "started_skill_builders_count",
    "mastered_skill_builders_count",
    "answered_problems_count",
    "mean_problem_correctness",
    "mean_problem_time_on_task",
    "mean_class_score"
]

scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])
# Save the fitted scaler
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

In [63]:
df.groupby("student_id").size().describe()


Unnamed: 0,0
count,10000.0
mean,11.0474
std,4.59929
min,5.0
25%,7.0
50%,10.0
75%,15.0
max,20.0


In [64]:
print(df.dtypes)


log_id                             int64
student_id                         int64
assignment_id                      int64
problem_id                         int64
start_time                        object
time_on_task                     float64
answer_before_tutoring             int64
fraction_of_hints_used           float64
attempt_count                    float64
answer_given                        bool
problem_completed                   bool
correct                             bool
next_correct                        bool
content_source                     int64
skills                             int64
problem_type                       int64
tutoring_types                     int64
student_answer_count             float64
mean_correct                     float64
mean_time_on_task                float64
class_id                           int64
account_creation_date              int64
started_problem_sets_count       float64
completed_problem_sets_count     float64
started_skill_bu

In [65]:
for col in df.select_dtypes(include="bool").columns:
    df[col] = df[col].astype(int)

In [66]:
df.to_csv("transformer_train_data.csv", index=False)


In [69]:
# ----------------------
# 5. Sequence dataset for Transformer
# ----------------------
class StudentSequenceDataset(Dataset):
    def __init__(self, df, context=12, target_col="next_correct"):
        self.context = context
        self.target_col = target_col

        # Group per student
        self.groups = []
        for sid, group in df.groupby("student_id"):
            features = group.drop(columns=["start_time","log_id", "student_id", "account_creation_date"]).values

            targets = group[target_col].astype(int).values

            if len(group) < context:
                # Pad sequences with zeros
                pad_len = context - len(group)
                padded_x = np.vstack([np.zeros((pad_len, features.shape[1])), features])
                padded_y = targets[-1]  # last available target
                self.groups.append((padded_x, padded_y))
            else:
                for i in range(len(group) - context):
                    x = features[i:i+context]
                    y = targets[i+context]
                    self.groups.append((x, y))

    def __len__(self):
        return len(self.groups)

    def __getitem__(self, idx):
        x, y = self.groups[idx]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# Example usage
dataset = StudentSequenceDataset(df, context=12, target_col="next_correct")
print("Dataset size:", len(dataset))
print("One sample X shape:", dataset[0][0].shape, "Target:", dataset[0][1])

Dataset size: 21430
One sample X shape: torch.Size([12, 26]) Target: tensor(1.)


## Training transformer


In [68]:
# import torch
# import torch.nn as nn
# from torch.utils.data import Dataset, DataLoader, random_split
# import numpy as np
# import pandas as pd
# from sklearn.metrics import accuracy_score, roc_auc_score


# # ----------------------
# # 1. Dataset
# # ----------------------
# class SAKTDataset(Dataset):
#     def __init__(self, df, context=12, target_col="next_correct"):
#         self.context = context
#         self.samples = []

#         # Convert booleans to ints
#         for col in df.select_dtypes(include="bool").columns:
#             df[col] = df[col].astype(int)

#         # Group per student
#         for sid, group in df.groupby("student_id"):
#             skills = group["skills"].values
#             correct = group["correct"].values
#             targets = group[target_col].values

#             if len(group) < context:
#                 pad_len = context - len(group)
#                 skills = np.concatenate([np.zeros(pad_len, dtype=int), skills])
#                 correct = np.concatenate([np.zeros(pad_len, dtype=int), correct])
#                 y = targets[-1]
#                 self.samples.append((skills, correct, y))
#             else:
#                 for i in range(len(group) - context):
#                     x_skills = skills[i:i+context]
#                     x_correct = correct[i:i+context]
#                     y = targets[i+context]
#                     self.samples.append((x_skills, x_correct, y))

#     def __len__(self):
#         return len(self.samples)

#     def __getitem__(self, idx):
#         skills, correct, y = self.samples[idx]
#         return (
#             torch.tensor(skills, dtype=torch.long),
#             torch.tensor(correct, dtype=torch.long),
#             torch.tensor(y, dtype=torch.float32),
#         )


# # ----------------------
# # 2. Model
# # ----------------------
# class SAKT(nn.Module):
#     def __init__(self, num_skills, d_model=32, n_heads=4, num_layers=2):
#         super().__init__()
#         self.skill_embed = nn.Embedding(num_skills + 1, d_model)
#         self.correct_embed = nn.Embedding(2, d_model)

#         encoder_layer = nn.TransformerEncoderLayer(
#             d_model=d_model, nhead=n_heads, batch_first=True
#         )
#         self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

#         self.fc = nn.Linear(d_model, 1)

#     def forward(self, skills, correct):
#         skill_emb = self.skill_embed(skills)
#         correct_emb = self.correct_embed(correct)
#         x = skill_emb + correct_emb
#         x = self.encoder(x)
#         x = x[:, -1, :]  # last time step
#         out = torch.sigmoid(self.fc(x))
#         return out.squeeze(-1)  # keep batch dim


# # ----------------------
# # 3. Training with Validation
# # ----------------------
# def train_model(df, num_epochs=10, batch_size=32, lr=1e-3, context=12, val_split=0.2):
#     # Dataset
#     dataset = SAKTDataset(df, context=context, target_col="next_correct")
#     val_size = int(len(dataset) * val_split)
#     train_size = len(dataset) - val_size
#     train_ds, val_ds = random_split(dataset, [train_size, val_size])

#     train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
#     val_loader = DataLoader(val_ds, batch_size=batch_size)

#     # Model
#     num_skills = int(df["skills"].max()) + 1
#     model = SAKT(num_skills=num_skills, d_model=64, n_heads=4, num_layers=2)

#     criterion = nn.BCELoss()
#     optimizer = torch.optim.Adam(model.parameters(), lr=lr)

#     # Train
#     for epoch in range(num_epochs):
#         model.train()
#         total_loss = 0
#         for skills, correct, y in train_loader:
#             optimizer.zero_grad()
#             preds = model(skills, correct)
#             loss = criterion(preds, y)
#             loss.backward()
#             optimizer.step()
#             total_loss += loss.item() * len(y)

#         avg_train_loss = total_loss / train_size

#         # Validation
#         model.eval()
#         all_preds, all_targets = [], []
#         with torch.no_grad():
#             for skills, correct, y in val_loader:
#                 preds = model(skills, correct)
#                 all_preds.extend(preds.cpu().numpy())
#                 all_targets.extend(y.cpu().numpy())

#         val_loss = criterion(torch.tensor(all_preds), torch.tensor(all_targets)).item()
#         val_acc = accuracy_score(all_targets, np.round(all_preds))
#         try:
#             val_auc = roc_auc_score(all_targets, all_preds)
#         except ValueError:
#             val_auc = float("nan")  # if only 1 class in val targets

#         print(
#             f"Epoch {epoch+1}/{num_epochs} "
#             f"- Train Loss: {avg_train_loss:.4f} "
#             f"- Val Loss: {val_loss:.4f} "
#             f"- Val Acc: {val_acc:.4f} "
#             f"- Val AUC: {val_auc:.4f}"
#         )

#     return model


In [70]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd

# ----------------------
# 1. Dataset
# ----------------------
class SAKTDatasetAllFeatures(Dataset):
    def __init__(self, df, numeric_cols, categorical_cols, context=12, target_col="next_correct"):
        self.context = context
        self.samples = []
        self.numeric_cols = numeric_cols
        self.categorical_cols = categorical_cols

        # Convert boolean columns to int
        for col in df.select_dtypes(include="bool").columns:
            df[col] = df[col].astype(int)

        # Store categorical mappings (int IDs)
        self.cat_maps = {col: sorted(df[col].unique()) for col in categorical_cols}
        self.cat_to_idx = {
            col: {cat: i for i, cat in enumerate(self.cat_maps[col])} for col in categorical_cols
        }

        # Group per student
        for sid, group in df.groupby("student_id"):
            # Convert categorical to IDs
            cat_data = []
            for col in categorical_cols:
                cat_data.append(np.array([self.cat_to_idx[col][v] for v in group[col].values]))
            cat_data = np.stack(cat_data, axis=1)  # shape: [seq_len, num_cats]

            # Numeric features
            num_data = group[numeric_cols].values.astype(np.float32)

            # Target
            targets = group[target_col].values

            # Pad sequences shorter than context
            if len(group) < context:
                pad_len = context - len(group)
                cat_data = np.concatenate([np.zeros((pad_len, len(categorical_cols)), dtype=int), cat_data])
                num_data = np.concatenate([np.zeros((pad_len, len(numeric_cols)), dtype=np.float32), num_data])
                y = targets[-1]
                self.samples.append((num_data, cat_data, y))
            else:
                for i in range(len(group) - context):
                    x_num = num_data[i:i+context]
                    x_cat = cat_data[i:i+context]
                    y = targets[i+context]
                    self.samples.append((x_num, x_cat, y))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        num_data, cat_data, y = self.samples[idx]
        return (
            torch.tensor(num_data, dtype=torch.float32),
            torch.tensor(cat_data, dtype=torch.long),
            torch.tensor(y, dtype=torch.float32)
        )

# ----------------------
# 2. Transformer model
# ----------------------
class SAKTAllFeatures(nn.Module):
    def __init__(self, numeric_dim, categorical_vocab_sizes, d_model=64, n_heads=4, num_layers=2):
        super().__init__()
        self.d_model = d_model

        # Embeddings for categorical features
        self.cat_embeddings = nn.ModuleList([
            nn.Embedding(vocab_size, d_model) for vocab_size in categorical_vocab_sizes.values()
        ])

        # Linear projection for numeric features
        self.num_linear = nn.Linear(numeric_dim, d_model)

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_heads, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Output layer
        self.fc = nn.Linear(d_model, 1)

    def forward(self, numeric, categorical):
        # Embed categorical features
        cat_embeds = []
        for i, emb_layer in enumerate(self.cat_embeddings):
            cat_embeds.append(emb_layer(categorical[:, :, i]))
        cat_embeds = torch.stack(cat_embeds, dim=0).sum(dim=0)  # sum over categories

        # Project numeric features
        num_proj = self.num_linear(numeric)

        # Combine numeric + categorical
        x = num_proj + cat_embeds

        # Transformer encoder
        x = self.encoder(x)
        x = x[:, -1, :]  # last time step

        # Output
        out = torch.sigmoid(self.fc(x))
        return out.squeeze(-1)

# ----------------------
# 3. Training with Validation
# ----------------------
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score

def train_model_all_features(
    df,
    numeric_cols,
    categorical_cols,
    num_epochs=10,
    batch_size=32,
    lr=1e-3,
    context=12,
    val_split=0.2
):
    """
    Train Transformer model (SAKTAllFeatures) using both numeric and categorical features.
    """

    # ----------------------
    # Dataset
    # ----------------------
    dataset = SAKTDatasetAllFeatures(df, numeric_cols, categorical_cols, context=context, target_col="next_correct")
    val_size = int(len(dataset) * val_split)
    train_size = len(dataset) - val_size
    train_ds, val_ds = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size)

    # ----------------------
    # Model
    # ----------------------
    categorical_vocab_sizes = {col: df[col].nunique() for col in categorical_cols}
    model = SAKTAllFeatures(
        numeric_dim=len(numeric_cols),
        categorical_vocab_sizes=categorical_vocab_sizes,
        d_model=64,
        n_heads=4,
        num_layers=2
    )

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # ----------------------
    # Training loop
    # ----------------------
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for num_data, cat_data, y in train_loader:
            optimizer.zero_grad()
            preds = model(num_data, cat_data)
            loss = criterion(preds, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * len(y)

        avg_train_loss = total_loss / train_size

        # ----------------------
        # Validation
        # ----------------------
        model.eval()
        all_preds, all_targets = [], []
        with torch.no_grad():
            for num_data, cat_data, y in val_loader:
                preds = model(num_data, cat_data)
                all_preds.extend(preds.cpu().numpy())
                all_targets.extend(y.cpu().numpy())

        all_preds = np.array(all_preds)
        all_targets = np.array(all_targets)

        val_loss = criterion(torch.tensor(all_preds), torch.tensor(all_targets)).item()
        val_acc = accuracy_score(all_targets, np.round(all_preds))
        try:
            val_auc = roc_auc_score(all_targets, all_preds)
        except ValueError:
            val_auc = float("nan")  # e.g. if validation set has only 1 class

        print(
            f"Epoch {epoch+1}/{num_epochs} "
            f"- Train Loss: {avg_train_loss:.4f} "
            f"- Val Loss: {val_loss:.4f} "
            f"- Val Acc: {val_acc:.4f} "
            f"- Val AUC: {val_auc:.4f}"
        )

    return model


In [71]:
df = pd.read_csv("transformer_train_data.csv")
numeric_cols = [
    "time_on_task", "fraction_of_hints_used", "attempt_count",
    "student_answer_count", "mean_correct", "mean_time_on_task",
    "started_problem_sets_count", "completed_problem_sets_count",
    "started_skill_builders_count", "mastered_skill_builders_count",
    "answered_problems_count", "mean_problem_correctness",
    "mean_problem_time_on_task", "mean_class_score"
]

categorical_cols = ["problem_type", "content_source", "skills", "tutoring_types"]

# Train
trained_model = train_model_all_features(df, numeric_cols, categorical_cols, num_epochs=10, batch_size=32, lr=1e-3, context=12)




Epoch 1/10 - Train Loss: 0.5821 - Val Loss: 0.5630 - Val Acc: 0.7177 - Val AUC: 0.8028
Epoch 2/10 - Train Loss: 0.5441 - Val Loss: 0.5296 - Val Acc: 0.7296 - Val AUC: 0.8091
Epoch 3/10 - Train Loss: 0.5255 - Val Loss: 0.5343 - Val Acc: 0.7343 - Val AUC: 0.8074
Epoch 4/10 - Train Loss: 0.5140 - Val Loss: 0.5290 - Val Acc: 0.7298 - Val AUC: 0.8060
Epoch 5/10 - Train Loss: 0.5047 - Val Loss: 0.5295 - Val Acc: 0.7359 - Val AUC: 0.8131
Epoch 6/10 - Train Loss: 0.4937 - Val Loss: 0.5518 - Val Acc: 0.7345 - Val AUC: 0.8055
Epoch 7/10 - Train Loss: 0.4862 - Val Loss: 0.5456 - Val Acc: 0.7214 - Val AUC: 0.8086
Epoch 8/10 - Train Loss: 0.4822 - Val Loss: 0.5525 - Val Acc: 0.7305 - Val AUC: 0.8073
Epoch 9/10 - Train Loss: 0.4677 - Val Loss: 0.5381 - Val Acc: 0.7352 - Val AUC: 0.8091
Epoch 10/10 - Train Loss: 0.4609 - Val Loss: 0.5423 - Val Acc: 0.7308 - Val AUC: 0.8056


In [72]:
from sklearn.metrics import accuracy_score, roc_auc_score


def evaluate_model(model, dataset, batch_size=64):
    loader = DataLoader(dataset, batch_size=batch_size)
    model.eval()

    all_preds, all_targets = [], []
    with torch.no_grad():
        for skills, correct, y in loader:
            preds = model(skills, correct)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(y.cpu().numpy())

    # Round predictions for accuracy
    acc = accuracy_score(all_targets, np.round(all_preds))

    # AUC (may fail if all y are 0 or 1)
    try:
        auc = roc_auc_score(all_targets, all_preds)
    except ValueError:
        auc = float("nan")

    return acc, auc


In [73]:
# Build dataset again
dataset = SAKTDatasetAllFeatures(df, numeric_cols, categorical_cols, context=context, target_col="next_correct")

# Evaluate
acc, auc = evaluate_model(trained_model, dataset)
print(f"Final Accuracy: {acc:.4f}, Final AUC: {auc:.4f}")

Final Accuracy: 0.7831, Final AUC: 0.8665


##Testing the model

###Test dataset preprocessing

In [74]:
df1 = pd.read_csv("test_data.csv")
df1 = df1.sort_values(["student_id", "start_time"]).reset_index(drop=True)
df1["correct"] = df1["correct"].fillna(False).astype(bool)

num_cols = df1.select_dtypes(include=["float64", "int64"]).columns
cat_cols = df1.select_dtypes(include=["object", "bool"]).columns

for col in num_cols:
    if df1[col].isnull().any():
        df1[col] = df1[col].fillna(df1[col].median())

for col in cat_cols:
    if df1[col].isnull().any():
        df1[col] = df1[col].fillna("unknown")


# Load encoders and scaler
with open("label_encoders.pkl", "rb") as f:
    label_encoders = pickle.load(f)

for col in ["problem_type", "content_source", "skills", "tutoring_types","answer_before_tutoring","account_creation_date"]:
    df1[col] = df1[col].astype(str)

    # Replace unseen categories with "unknown"
    df1[col] = df1[col].apply(lambda x: x if x in label_encoders[col].classes_ else "unknown")

    # If "unknown" not in classes_, add it
    if "unknown" not in label_encoders[col].classes_:
        label_encoders[col].classes_ = np.append(label_encoders[col].classes_, "unknown")

    # Transform using training encoder
    df1[col] = label_encoders[col].transform(df1[col])



with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

# Transform test data using **fitted scaler**
df1[numeric_features] = scaler.transform(df1[numeric_features])

df1 = df1.drop(columns=["start_time"])
for col in df1.select_dtypes(include="bool").columns:
    df1[col] = df1[col].astype(int)

df1.to_csv("transformer_test_data.csv", index=False)



  df1["correct"] = df1["correct"].fillna(False).astype(bool)


In [75]:
# ----------------------
# 1. Prepare Test Dataset
# ----------------------
test_df = pd.read_csv("transformer_test_data.csv")  # preprocessed already
context = 12  # same as training

numeric_cols = [
    "time_on_task", "fraction_of_hints_used", "attempt_count",
    "student_answer_count", "mean_correct", "mean_time_on_task",
    "started_problem_sets_count", "completed_problem_sets_count",
    "started_skill_builders_count", "mastered_skill_builders_count",
    "answered_problems_count", "mean_problem_correctness",
    "mean_problem_time_on_task", "mean_class_score"
]

categorical_cols = ["problem_type", "content_source", "skills", "tutoring_types"]


test_dataset = SAKTDatasetAllFeatures(
    test_df, numeric_cols=numeric_cols, categorical_cols=categorical_cols,
    context=context, target_col='next_correct'
)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# ----------------------
# 2. Run Predictions
# ----------------------
model = trained_model
model.eval()

all_preds, all_targets = [], []

with torch.no_grad():
    for numeric, categorical, y in test_loader:
        preds = model(numeric, categorical)
        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(y.cpu().numpy())

# ----------------------
# 3. Convert to 0/1 predictions
# ----------------------
pred_labels = [1 if p > 0.5 else 0 for p in all_preds]

# ----------------------
# 4. Evaluate
# ----------------------

from sklearn.metrics import accuracy_score, roc_auc_score
accuracy = accuracy_score(all_targets, pred_labels)
auc = roc_auc_score(all_targets, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test AUC: {auc:.4f}")



Test Accuracy: 0.6809
Test AUC: 0.7457
