In [22]:
import os
import numpy as np
import pandas as pd
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel

# ------------ paths & constants ------------
# File locations
CSV_PATH = "../data/cleaned_courses.csv"  # adjust if needed
MODEL_DIR = "model_v5_NEW_VIEWS"  # where you saved training

# Settings that must stay the same as in training
TEXT_COLUMN = "TextForBERT"
MODEL_NAME = "bert-base-uncased"
PROJ_DIM = 256

# Maximum tokenized length handed to the tokenizer
MAX_LEN = 64

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)


Using device: cuda


In [23]:
class CourseEncoder(nn.Module):
    """Transformer backbone followed by a small projection head.

    Token states from the backbone are mean-pooled over the positions kept by
    the attention mask, passed through a Linear -> ReLU -> Linear head, and
    L2-normalized, so every returned vector has unit length.
    """

    def __init__(self, base_model_name: str = MODEL_NAME, proj_dim: int = PROJ_DIM):
        """Load the pretrained backbone and build the projection head.

        Args:
            base_model_name: name or path given to ``AutoModel.from_pretrained``.
            proj_dim: size of the output embedding.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(base_model_name)
        width = self.bert.config.hidden_size
        # Attribute names and layer order are deliberately unchanged: they
        # determine the state-dict keys a saved checkpoint is matched against.
        head_layers = [
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, proj_dim),
        ]
        self.proj = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return unit-norm embeddings, one row per input sequence."""
        encoder_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )
        token_states = encoder_out.last_hidden_state  # (B, L, H)

        # Masked mean over the sequence axis: masked-out positions contribute
        # nothing to the sum and are not counted in the denominator.
        keep = attention_mask.unsqueeze(-1).float()  # (B, L, 1)
        token_total = (token_states * keep).sum(dim=1)
        token_count = keep.sum(dim=1).clamp(min=1e-9)  # guards against division by zero
        sentence_vec = token_total / token_count  # (B, H)

        projected = self.proj(sentence_vec)
        return nn.functional.normalize(projected, p=2, dim=-1)

In [24]:
# Load tokenizer from the fine-tuned directory
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# Rebuild CourseEncoder and load the state dict
model = CourseEncoder(base_model_name=MODEL_NAME, proj_dim=PROJ_DIM)
# NOTE(review): torch.load unpickles the file, and unpickling can execute
# arbitrary code. Only load checkpoints you produced yourself; on PyTorch
# versions that support it, consider passing weights_only=True.
state_dict = torch.load(os.path.join(MODEL_DIR, "pytorch_model.bin"), map_location=device)
model.load_state_dict(state_dict)
model.to(device)
model.eval()  # inference mode: disables dropout

print("Model loaded from", MODEL_DIR)

# Load course data
df = pd.read_csv(CSV_PATH)
if TEXT_COLUMN not in df.columns:
    raise ValueError(f"Column {TEXT_COLUMN} not in CSV")

# astype(str) alone would turn missing values into the literal string "nan",
# which would then be embedded as if it were the course text. Replace missing
# values with empty text first, and say so when it happens.
missing_text = int(df[TEXT_COLUMN].isna().sum())
if missing_text:
    print(f"Warning: {missing_text} rows have no {TEXT_COLUMN}; embedding them as empty text.")

course_texts = df[TEXT_COLUMN].fillna("").astype(str).tolist()
print(f"Loaded {len(course_texts)} courses.")


Model loaded from model_v5_NEW_VIEWS
Loaded 518 courses.


In [25]:
@torch.no_grad()
def embed_texts(texts, tokenizer, model, batch_size=32, max_len=MAX_LEN):
    """Encode texts into a 2-D NumPy array of embeddings, one row per text.

    Args:
        texts: a single string or any iterable of strings. A bare string is
            treated as one text rather than being sliced into characters.
        tokenizer: callable used as ``tokenizer(batch, padding=True,
            truncation=True, max_length=..., return_tensors="pt")``; its
            result must support ``.to(device)`` and expose ``"input_ids"``
            and ``"attention_mask"``.
        model: callable as ``model(input_ids, attention_mask)`` returning a
            2-D tensor with one row per input.
        batch_size: number of texts per forward pass (must be >= 1).
        max_len: truncation length passed to the tokenizer.

    Returns:
        Array of shape (N, D). For empty input, an empty (0, 0) float32 array
        is returned, because D cannot be known without a forward pass.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if not texts:
        return np.empty((0, 0), dtype=np.float32)

    # Send inputs to wherever the given model lives instead of relying on a
    # module-level device variable that may not match the model passed in.
    first_param = None
    if hasattr(model, "parameters"):
        first_param = next(iter(model.parameters()), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    chunks = []
    for start in range(0, len(texts), batch_size):
        enc = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt"
        ).to(run_device)

        z = model(enc["input_ids"], enc["attention_mask"])  # (B, D)
        chunks.append(z.cpu().numpy())
    return np.vstack(chunks)  # (N, D)


In [26]:
# Encode every course text once; the resulting matrix is reused for each query.
course_embs = embed_texts(texts=course_texts, tokenizer=tokenizer, model=model)
print("Course embeddings shape:", course_embs.shape)


Course embeddings shape: (518, 256)


In [27]:
def recommend_courses(
    query_text: str,
    top_k: int = 5,
    show_cols=("Code", "Title", "Faculty")
):
    """Print and return the courses most similar to a free-text query.

    Relies on the notebook-level ``model``, ``tokenizer``, ``course_embs``,
    ``df`` and ``embed_texts``.

    Args:
        query_text: free-text description of the desired course.
        top_k: number of results to return; values below zero are treated as
            zero instead of slicing from the end of the ranking.
        show_cols: DataFrame columns to include per result; names that are
            not columns of ``df`` are skipped.

    Returns:
        A list of dicts ordered by decreasing similarity, each holding
        ``"rank"``, ``"similarity"`` and the requested columns.
    """
    # 1) Embed the query through the same helper used for the course corpus,
    #    so both sides share tokenization settings and pooling.
    model.eval()
    q_emb = embed_texts([query_text], tokenizer, model)[0]  # (D,)

    # 2) Cosine similarity since both sides are normalized
    sims = course_embs @ q_emb  # (N,)

    # 3) Top-K indices (a negative top_k would otherwise drop items from the
    #    end of the ranking rather than returning nothing)
    top_k = max(top_k, 0)
    top_idx = np.argsort(-sims)[:top_k]

    results = []
    for rank, idx in enumerate(top_idx, start=1):
        row = df.iloc[idx]
        entry = {
            "rank": rank,
            "similarity": float(sims[idx]),
        }
        for c in show_cols:
            if c in df.columns:
                entry[c] = row[c]
        results.append(entry)

    # Pretty print
    print(f"\nQuery: {query_text}\n")
    for r in results:
        print(f"#{r['rank']}  (cos = {r['similarity']:.4f})")
        for c in show_cols:
            if c in r:
                print(f"   {c}: {r[c]}")
        print()

    return results


In [28]:
# Interactive prompt: keeps asking for queries until the user quits.
while True:
    try:
        q = input("\nDescribe the kind of course you're looking for (or 'quit'): ")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt ends the session without a traceback.
        break

    q = q.strip()
    if q.lower() in {"q", "quit", "exit"}:
        break
    if not q:
        # Nothing typed: ask again rather than ranking courses for empty text.
        continue
    recommend_courses(q, top_k=5)


Query: I like chemistry

#1  (cos = 0.4196)
   Code: 3331
   Title: Application of Mathematical Methods to Chemical Engineering 
   Faculty: CHG

#2  (cos = 0.4164)
   Code: 1371
   Title: Numerical Methods and Engineering Computation in Chemical Engineering 
   Faculty: CHG

#3  (cos = 0.3926)
   Code: 8195
   Title: Advanced Numerical Methods in Chemical and Biological Engineering 
   Faculty: CHG

#4  (cos = 0.3729)
   Code: 3122
   Title: Chemical Engineering Practice 
   Faculty: CHG

#5  (cos = 0.3495)
   Code: 4116
   Title: Chemical Engineering Laboratory 
   Faculty: CHG


Query: I hate chemistry

#1  (cos = 0.3133)
   Code: 5398
   Title: Independent Engineering Study 
   Faculty: MCG

#2  (cos = 0.3045)
   Code: 5301
   Title: Soil and Water Conservation Engineering 
   Faculty: CVG

#3  (cos = 0.2984)
   Code: 4130
   Title: Advanced Environmental Engineering 
   Faculty: CVG

#4  (cos = 0.2917)
   Code: 5341
   Title: Turbomachinery 
   Faculty: MCG

#5  (cos = 0.2910)
  