In [399]:
import numpy as np
import pyGMs as gm
import matplotlib.pyplot as plt
import time
import pyGMs.ising
import pickle
import numpy as np
from itertools import product
import pandas as pd
from math import log, sqrt
import os

### Get Data

In [400]:
with open('user_dict.pkl', 'rb') as f:
    data = pickle.load(f)

print(type(data))
print(data)

<class 'dict'>
{1: array([ 5., nan, nan, ..., nan, nan, nan], shape=(3883,)), 2: array([nan, nan, nan, ..., nan, nan, nan], shape=(3883,)), 3: array([nan, nan, nan, ..., nan, nan, nan], shape=(3883,)), 4: array([nan, nan, nan, ..., nan, nan, nan], shape=(3883,)), 5: array([nan, nan, nan, ..., nan, nan, nan], shape=(3883,)), 6: array([ 4., nan, nan, ..., nan, nan, nan], shape=(3883,)), 7: array([nan, nan, nan, ..., nan, nan, nan], shape=(3883,)), 8: array([ 4., nan, nan, ..., nan, nan, nan], shape=(3883,)), 9: array([ 5., nan, nan, ..., nan, nan, nan], shape=(3883,)), 10: array([ 5.,  5., nan, ..., nan, nan, nan], shape=(3883,)), 11: array([nan, nan, nan, ..., nan, nan, nan], shape=(3883,)), 12: array([nan, nan, nan, ..., nan, nan, nan], shape=(3883,)), 13: array([nan,  3., nan, ..., nan, nan, nan], shape=(3883,)), 14: array([nan, nan, nan, ..., nan, nan, nan], shape=(3883,)), 15: array([nan, nan, nan, ..., nan, nan, nan], shape=(3883,)), 16: array([nan, nan, nan, ..., nan, nan, nan], s

In [401]:
user_ids = sorted(data.keys())
num_users = len(user_ids)
num_movies = len(data[user_ids[0]])

X = np.full((num_users, num_movies), np.nan)

for i, user_id in enumerate(user_ids):
    X[i] = data[user_id]

print("Matrix shape:", X.shape)
print("Example row (User 1):", X[0][:5])

Matrix shape: (6040, 3883)
Example row (User 1): [ 5. nan nan nan nan]


In [402]:
Xte = X[:1000]
Xtr = X[1000:]
print("X_te shape:", Xte.shape)
print("X_tr shape:", Xtr.shape)

X_te shape: (1000, 3883)
X_tr shape: (5040, 3883)


In [403]:
Xtr_missing = np.where(np.isnan(Xtr), -1, Xtr).astype(int)
Xte_missing = np.where(np.isnan(Xte), -1, Xte).astype(int)

In [404]:
Xtr = (Xtr >= 4).astype(int)
Xte = (Xte >= 4).astype(int)

### Model

In [405]:
# ising_mw = gm.ising.fit_mweight(Xtr[:20], C=.9, threshold=6e-2)

# # plt.figure(figsize=(8,8));
# # gm.drawMarkovGraph(ising_mw,labels=short)

# print("Pseudo-likelihood: ",ising_mw.pseudolikelihood( Xtr ).mean())

In [406]:
def impute_missing(model, Xobs):
    m,n = Xobs.shape
    Xhat = np.copy(Xobs);
    for j in range(m):
        x_obs = {i:Xobs[j,i] for i in range(n) if Xobs[j,i] in (0,1)}
        x_unobs = [i for i in range(n) if Xobs[j,i] not in (0,1)]
        if not x_unobs: continue
        cond = gm.GraphModel([f.condition(x_obs) for f in model.factorsWithAny(x_unobs)])
        for x in cond.X:
            if x.states == 0: x.states = 1;  # fix a bug in GraphModel behavior for missing vars...
        jt = gm.wmb.JTree(cond, weights=1e-6) # 0: for maximization
        x_hat = jt.argmax();
        for i in x_unobs: Xhat[j,i] = x_hat[i]
    return Xhat

In [407]:
# Xte_hat = impute_missing(ising_mw, Xte_missing[:20])
# print('Error rate:', np.mean((Xte_hat != Xte_missing[:20])[Xte_missing[:20] < 0]))


In [408]:
# Experiment setup
input_sizes = [50, 100, 500, 1000]
all_results = []

for m in input_sizes:
    print(f"\n=== Running experiments for m={m} ===")
    
    print("[INFO] Preparing training data...")
    Xtr_subset = Xtr[:m]
    m_subset, n_subset = Xtr_subset.shape
    
    print("[INFO] Computing epsilon and learning rates...")
    epsilon = sqrt(log(n_subset) / m_subset)
    learning_rates = {
        'fast': 1 - 0.5 * epsilon,
        'default': 1 - 1.0 * epsilon,
        'slow': 1 - 1.5 * epsilon
    }

    # param_grid = {
    #     'C': [2, 5, 10, 20],
    #     'threshold': [5e-2, 1e-1, 2e-1],
    #     'lr_type': ['default', 'fast', 'slow']
    # }

    param_grid = {
    'C': [1, 2, 5, 10],
    'threshold': [5e-2],
    'lr_type': ['slow']
    }

    for C, threshold, lr_type in product(param_grid['C'], param_grid['threshold'], param_grid['lr_type']):
        lr = learning_rates[lr_type]
        print(f"\n[INFO] Testing m={m}, C={C}, threshold={threshold}, lr_type={lr_type} (lr={lr:.4f})")

        # Training
        print("[INFO] Starting training...")
        train_start = time.time()
        model = gm.ising.fit_mweight(
            Xtr_subset,
            C=C,
            threshold=threshold,
            learning_rate=lr
        )
        train_time = time.time() - train_start
        print(f"[INFO] Training complete in {train_time:.2f} seconds.")

        # # Prediction
        # print("[INFO] Starting prediction on test set...")
        # eval_start = time.time()
        # Xte_hat = impute_missing(model, Xte_missing)
        # eval_time = time.time() - eval_start
        # print(f"[INFO] Prediction complete in {eval_time:.2f} seconds.")

        # Metrics
        print("[INFO] Calculating evaluation metrics...")
        # error_rate = np.mean((Xte_hat != Xte_missing)[Xte_missing < 0])
        pll_train = model.pseudolikelihood(Xtr_subset).mean()
        pll_test = model.pseudolikelihood(Xte).mean()
        print(f"Train Pseudo-log-likelihood: {pll_train:.4f}")
        print(f"Test Pseudo-log-likelihood: {pll_test:.4f}")

        result = {
            'input_size': m,
            'C': C,
            'threshold': threshold,
            'lr_type': lr_type,
            'lr_value': lr,
            # 'error_rate': error_rate,
            # 'accuracy': 1 - error_rate if not np.isnan(error_rate) else np.nan,
            'train_pseudolikelihood': pll_train,
            'test_pseudolikelihood': pll_test,
            'training_time_seconds': train_time
            # 'prediction_time_seconds': eval_time,
        }

        # Append result and immediately save
        output_path = 'results/ising_model_size_experiment_latest.csv'
        os.makedirs('results', exist_ok=True)

        write_header = not os.path.exists(output_path)
        results_df = pd.DataFrame([result])
        results_df.to_csv(output_path, mode='a', header=write_header, index=False)
        print(f"[INFO] Appended and saved result for m={m}, C={C}, threshold={threshold}, lr_type={lr_type}")


=== Running experiments for m=50 ===
[INFO] Preparing training data...
[INFO] Computing epsilon and learning rates...

[INFO] Testing m=50, C=1, threshold=0.05, lr_type=slow (lr=0.3902)
[INFO] Starting training...


KeyboardInterrupt: 