In [23]:
import pickle
import numpy as np
import pyGMs as gm
import time
import pandas as pd

In [24]:
with open('user_dict.pkl','rb') as f:
    user_dict = pickle.load(f)
with open('train_users.pkl','rb') as f:
    train_users = pickle.load(f)
with open('test_users.pkl','rb') as f:
    test_users = pickle.load(f)

LIKE_THRESHOLD = 4
n_movies = next(iter(user_dict.values())).shape[0]

In [25]:
def build_matrix(uids):
    M = np.empty((len(uids), n_movies), dtype=float)
    M[:] = np.nan
    for i, uid in enumerate(uids):
        M[i] = user_dict[uid]
    return M

def binarize(X):
    return np.where(
        np.isnan(X),
        np.nan,
        (X >= LIKE_THRESHOLD).astype(int)
    )

In [26]:
Xtr = build_matrix(train_users)
Xte = build_matrix(test_users)

Xtrb = binarize(Xtr)
Xteb = binarize(Xte)

Xtr_full = np.nan_to_num(Xtrb, nan=0).astype(int)
Xte_full = np.nan_to_num(Xteb, nan=0).astype(int)

In [27]:
counts = np.sum(~np.isnan(Xtrb), axis=0)
sums   = np.nansum(Xtrb, axis=0)
p_i = np.divide(
    sums,
    counts,
    out=np.full_like(sums, 0.5),
    where=(counts > 0)
)

factors = [gm.Factor([gm.Var(i,2)], [1-p_i[i], p_i[i]]) for i in range(n_movies)]
model0 = gm.GraphModel(factors, isLog=True)

In [28]:
def avg_ll(masked, full):
    lls = []
    for mrow, frow in zip(masked, full):
        observed = np.where(~np.isnan(mrow))[0]
        obs_list = observed.tolist()          
        if obs_list:                           
            ll = model0.logValue(frow, subset=obs_list)
            lls.append(ll[0] if isinstance(ll, np.ndarray) else ll)
    return np.mean(lls)

train_ll = avg_ll(Xtrb, Xtr_full)
test_ll  = avg_ll(Xteb, Xte_full)

In [29]:

sizes = [50, 100, 500, 1000, 2500, len(train_users)]
results = []

for sz in sizes:
    subset_train = train_users[:sz]
    Xtr   = build_matrix(subset_train)
    Xtrb  = binarize(Xtr)
    Xtr_f = np.nan_to_num(Xtrb, nan=0).astype(int)

    Xte   = build_matrix(test_users)
    Xteb  = binarize(Xte)
    Xte_f = np.nan_to_num(Xteb, nan=0).astype(int)

    t0 = time.time()
    counts = np.sum(~np.isnan(Xtrb), axis=0)
    sums   = np.nansum(Xtrb, axis=0)
    p_i = np.divide(
        sums,
        counts,
        out=np.full_like(sums, 0.5),
        where=(counts > 0)
    )
    factors = [gm.Factor([gm.Var(i,2)], [1-p_i[i], p_i[i]]) 
               for i in range(n_movies)]
    model0 = gm.GraphModel(factors, isLog=True)
    train_time_s = time.time() - t0

    t1 = time.time()
    pll = avg_ll(Xteb, Xte_f)
    pred_time_s = time.time() - t1

    preds = (p_i >= 0.5).astype(int)

    preds_mat = np.tile(preds, (Xte_f.shape[0], 1))

    mask = ~np.isnan(Xteb)

    accuracy = (preds_mat[mask] == Xte_f[mask]).mean()

    results.append({
        'train_size':       sz,
        'train_time_s':     train_time_s,
        'pred_time_s':      pred_time_s,
        'accuracy':         accuracy,
        'pseudolikelihood': pll
    })

In [30]:
df = pd.DataFrame(
    results,
    columns=['train_size','train_time_s','pred_time_s','accuracy','pseudolikelihood']
)

df

Unnamed: 0,train_size,train_time_s,pred_time_s,accuracy,pseudolikelihood
0,50,0.340264,0.807619,0.63976,92.754583
1,100,0.271227,0.653539,0.658951,93.074557
2,500,0.270178,0.650992,0.678251,93.012851
3,1000,0.296141,0.715901,0.681912,92.595851
4,2500,0.342703,0.69761,0.684399,92.389412
5,5040,0.384366,0.675544,0.685604,92.182959
