In [1]:
import os
import csv
import pandas as pd
import time
import pickle
import sys
import datetime
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from scipy.optimize import minimize


In [2]:
print(os.getcwd())
# it should end with this: /AITutor_SeqModeling
# if not, run the next block


/local-scratch/localhome/pagand/projects/mygitsDaTu/AITutor_SeqModeling/KnowledgeTracking


In [3]:
# run if the current directory is not AITutor_SeqModeling
cwd = os.chdir(os.path.join(os.getcwd(), ".."))
print(os.getcwd())

/local-scratch/localhome/pagand/projects/mygitsDaTu/AITutor_SeqModeling


In [4]:
File_pickle = "data/KT_logs_annotated.pkl"

# read from pickle
df = pd.read_pickle(File_pickle)

df.head()

Unnamed: 0,username,skill,correct,time
0,a1,"[Supervised Learning, Classification Algorithms]",True,0.0
1,a1,"[Supervised Learning, Classification Algorithms]",False,3.7267
2,a2,"[Supervised Learning, Classification Algorithms]",False,0.0
3,a2,"[Supervised Learning, Classification Algorithms]",True,1.987467
4,a3,"[Supervised Learning, Classification Algorithms]",True,0.0


In [5]:
skills = pickle.load(open("data/Skill_hirereachy.pkl", "rb"))

In [None]:
# version 1: Only user params
# user_params = {}

# def initialize_user_params(user_id, skills):
#     user_params[user_id] = {}
#     for skill in skills.keys():
#         skill_params = skills[skill][-1]
#         user_params[user_id][skill] = {
#             "P(L)": skill_params[0],
#             "P(T)": skill_params[1],
#             "P(G)": skill_params[2],
#             "P(S)": skill_params[3]
#         }
# for user_id in df["username"].unique():
#     initialize_user_params(user_id, skills)

In [6]:
# version 2: seperate user-specific and skill specific parameters
def initialize_params(skills, user_ids):
    user_params = {}
    skill_params = {}
    for skill in skills.keys():
        skill_params[skill] = {
                "P(L)": skills[skill][-1][0],
                "P(T)": skills[skill][-1][1],
                "P(G)": skills[skill][-1][1],
                "P(S)": skills[skill][-1][3]
            }
        for user_id in user_ids:
            # assume all users have the same initial skill level
            # add prior knowledge here if exists
            user_params[user_id] = skill_params
    return skill_params, user_params

In [7]:
# Function to compute P(C_t|L_t, G, S)
def compute_prob_correctness(P_L, P_G, P_S, correct):
    if correct:
        return (1 - P_S) * P_L + P_G * (1 - P_L)
    else:
        return P_S * P_L + (1 - P_G) * (1 - P_L)

In [8]:
# (log-likelihood function)
def log_likelihood(interaction_log,  skill_params, user_params):
    log_likelihood = 0
    for _, row in interaction_log.iterrows():
        user_id = row["username"]
        skill_list = row["skill"]
        correctness = row["correct"]

        for skill in skill_list:
            # Retrieve user and skill parameters
            P_L = user_params[user_id][skill]["P(L)"]
            P_G = skill_params[skill]["P(G)"]
            P_S = skill_params[skill]["P(S)"]

            # Compute likelihood
            prob = compute_prob_correctness(P_L, P_G, P_S, correctness)
            log_likelihood += np.log(prob + 1e-9)

    return -log_likelihood

In [9]:
# Regularization term
def regularizer(user_params, skill_params, reg_lambda=0.1):
    regularization = 0
    for user_id, user_data in user_params.items():
        for skill, params in user_data.items():
            P_L_user = params["P(L)"]
            P_L_skill = skill_params[skill]["P(L)"]
            regularization += reg_lambda * (P_L_user - P_L_skill) ** 2
    return regularization

# EM Optimization: E-Step
def expectation_step(interaction_log, skill_params, user_params, reg_lambda=0.1):
    return log_likelihood(interaction_log, skill_params, user_params) + regularizer(user_params, skill_params, reg_lambda)


In [10]:
# EM Optimization: M-Step
def optimize_params(interaction_log, skill_params, user_params, skills, max_iter=50, reg_lambda=0.1):
    # Flatten parameters for optimization
    skill_flat = np.concatenate([list(skill_params[skill].values()) for skill in skills])
    user_flat = np.concatenate([list(user_params[user_id][skill]["P(L)"] for skill in skills)
                                 for user_id in user_params])

    def objective(flat_params):
        # Split into skill and user parameters
        skill_split = flat_params[:len(skills) * 4].reshape(len(skills), 4)
        user_split = flat_params[len(skills) * 4:].reshape(len(user_params), len(skills))

        # Update params
        for i, skill in enumerate(skills):
            skill_params[skill] = {"P(L)": skill_split[i, 0],
                                   "P(T)": skill_split[i, 1],
                                   "P(G)": skill_split[i, 2],
                                   "P(S)": skill_split[i, 3]}
        for j, user_id in enumerate(user_params):
            for k, skill in enumerate(skills):
                user_params[user_id][skill]["P(L)"] = user_split[j, k]

        return expectation_step(interaction_log, skill_params, user_params, reg_lambda)

    # Optimize
    result = minimize(objective, np.concatenate([skill_flat, user_flat]), method='L-BFGS-B', options={'maxiter': max_iter, 'disp':True,'maxfun':1, 'maxls': 1, 'ftol': 1e1, 'gtol': 1e1})
    return result

In [11]:
# Parent-Child Constraints
def enforce_constraints(user_params, skill_params, skills):
    for skill, skill_data in skills.items():
        parents = skill_data[-1]
        for parent in parents:
            if skill_params[skill]["P(L)"] >= skill_params[parent]["P(L)"]:
                skill_params[skill]["P(L)"] = skill_params[parent]["P(L)"] - 0.01 
            for user in user_params:
                parent_prob = user_params[user][parent]["P(L)"]
                child_prob = user_params[user][skill]["P(L)"]
                if child_prob >= parent_prob:
                    user_params[user][skill]["P(L)"] = parent_prob - 0.01  # Apply heuristic

In [12]:
# Full EM Algorithm
def run_em_with_constraints(interaction_log, skills, max_iter=1, reg_lambda=0.1):
    skill_params, user_params = initialize_params(skills, df["username"].unique())

    for iteration in range(max_iter):
        print(f"Iteration {iteration + 1}")

        # Optimize parameters
        result = optimize_params(interaction_log, skill_params, user_params, skills, reg_lambda=reg_lambda)

        # Enforce constraints
        enforce_constraints(user_params, skill_params, skills)

        # Output progress
        print(f"  Log-Likelihood: {-result.fun:.4f}")

    return skill_params, user_params

In [13]:
# Run the EM Algorithm
skill_params, user_params = run_em_with_constraints(df.iloc[:1000], skills)

# Output Results
print("Final Skill Parameters:")
for skill, params in skill_params.items():
    print(skill, params)

print("\nFinal User Parameters:")
for user_id, user_data in user_params.items():
    print(user_id)
    for skill, params in user_data.items():
        print(f"  {skill}: {params}")

Iteration 1


 This problem is unconstrained.
  log_likelihood += np.log(prob + 1e-9)


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         2813     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.15075D+03    |proj g|=  3.26370D+02


KeyboardInterrupt: 