In [1]:
import pandas as pd
from pathlib import Path

# Read the review log; the path is resolved relative to the current working
# directory. Later cells use its card_id, delta_t and rating columns.
dataset_path = Path("../fsrs-benchmark/dataset/3.csv")
df = pd.read_csv(dataset_path)

In [2]:
from itertools import accumulate

def cum_concat(x):
    """Return the running "sum" of the items of ``x`` as a list.

    The first entry is the first item itself; every later entry is the
    previous entry ``+`` the next item. For a sequence of lists this yields
    every prefix concatenation, e.g. ``[[1], [2]] -> [[1], [1, 2]]``.
    An empty input gives an empty list.
    """
    prefixes = []
    running = None
    for position, piece in enumerate(x):
        # The first item seeds the running total; later items are added to it.
        running = piece if position == 0 else running + piece
        prefixes.append(running)
    return prefixes

# The flattened histories below are assigned to `df` by position, while
# groupby yields the cards in sorted key order. A stable sort by card_id makes
# the row order match that group order (a no-op for an already sorted file)
# and keeps the original review order inside every card, so each row receives
# the history that belongs to it.
df = df.sort_values("card_id", kind="stable")


def _flatten_prefixes(per_card_prefixes):
    """Turn per-card cumulative lists into one comma-joined string per review.

    ``per_card_prefixes`` holds, for every card, the list of cumulative value
    lists produced by ``cum_concat``. The last element of each cumulative list
    (the current review) is dropped via ``[:-1]``, so the string only contains
    the values of the earlier reviews of the same card; a card's first review
    gets the empty string.
    """
    return [
        ",".join(map(str, prefix[:-1]))
        for card_prefixes in per_card_prefixes
        for prefix in card_prefixes
    ]


# One entry per card: a list with the cumulative delta_t / rating lists.
t_history = df.groupby("card_id", group_keys=False)["delta_t"].apply(
    lambda x: cum_concat([[i] for i in x])
)
r_history = df.groupby("card_id", group_keys=False)["rating"].apply(
    lambda x: cum_concat([[i] for i in x])
)
df["r_history"] = _flatten_prefixes(r_history)
df["t_history"] = _flatten_prefixes(t_history)
# Binary label: 1 when the rating is greater than 1, otherwise 0.
df["y"] = df["rating"].map(lambda x: 1 if x > 1 else 0)

In [3]:
# Keep only the reviews whose delta_t is non-zero, then number the remaining
# reviews of every card starting at 1.
df = df.loc[df["delta_t"].ne(0)].copy()
df["i"] = df.groupby("card_id").cumcount().add(1)
# r_history is a comma-joined string; take its first character, or "" when
# the history is empty.
df["first_rating"] = df["r_history"].map(lambda history: history[:1])
# The second remaining review of each card.
short_term_df = df.loc[df["i"].eq(2)]
short_term_df.head()

Unnamed: 0,card_id,review_th,delta_t,rating,r_history,t_history,y,i,first_rating
16,0,100,7,1,1111111111113134,-1000000000000000,0,2,1
39,1,97,7,3,133,-100,1,2,1
63,2,117,7,1,1111124,-1000000,0,2,1
104,3,108,7,1,11134,-10000,0,2,1
115,4,96,7,3,33,-10,1,2,3


In [4]:
import numpy as np
from scipy.optimize import minimize
from sklearn.metrics import root_mean_squared_error
from fsrs_optimizer import power_forgetting_curve


# Fit one stability value per distinct rating history.
# Maps r_history -> (stability rounded to 2 decimals, number of reviews used).
history_to_stability = dict()

# Start value of the optimiser and centre of the L1 penalty; loop-invariant.
init_s0 = 1

# A single groupby pass yields the same subsets as filtering the whole frame
# once per history, without rescanning every row for every key.
for r_history, reviews in short_term_df.groupby("r_history"):
    # Observed recall rate and sample size at each distinct interval.
    group = reviews.groupby("delta_t").agg({"y": ["mean", "count"]}).reset_index()
    delta_t = group["delta_t"]
    recall = group["y"]["mean"]
    count = group["y"]["count"]

    def loss(stability):
        """Count-weighted log loss of the curve's prediction, plus an L1 pull to init_s0."""
        y_pred = power_forgetting_curve(delta_t, stability)
        logloss = sum(
            -(recall * np.log(y_pred) + (1 - recall) * np.log(1 - y_pred))
            * count
        )
        l1 = np.abs(stability - init_s0) / 16
        return logloss + l1

    res = minimize(
        loss,
        x0=init_s0,
        bounds=((0.01, 300),),
    )
    # Only the fitted parameter is kept; the former per-iteration RMSE was
    # computed and then discarded, so it is no longer evaluated.
    stability = res.x[0]
    history_to_stability[r_history] = (round(stability, 2), count.sum())

In [5]:
# Fit one stability value per first rating.
# Maps first_rating -> (stability rounded to 2 decimals, number of reviews used).
rating_to_stability = dict()

# Start value of the optimiser and centre of the L1 penalty; loop-invariant.
init_s0 = 1

# A single groupby pass yields the same subsets as filtering the whole frame
# once per rating, without rescanning every row for every key.
for first_rating, reviews in short_term_df.groupby("first_rating"):
    # Observed recall rate and sample size at each distinct interval.
    group = reviews.groupby("delta_t").agg({"y": ["mean", "count"]}).reset_index()
    delta_t = group["delta_t"]
    recall = group["y"]["mean"]
    count = group["y"]["count"]

    def loss(stability):
        """Count-weighted log loss of the curve's prediction, plus an L1 pull to init_s0."""
        y_pred = power_forgetting_curve(delta_t, stability)
        logloss = sum(
            -(recall * np.log(y_pred) + (1 - recall) * np.log(1 - y_pred))
            * count
        )
        l1 = np.abs(stability - init_s0) / 16
        return logloss + l1

    res = minimize(
        loss,
        x0=init_s0,
        bounds=((0.01, 300),),
    )
    # Only the fitted parameter is kept; the former per-iteration RMSE was
    # computed and then discarded, so it is no longer evaluated.
    stability = res.x[0]
    rating_to_stability[first_rating] = (round(stability, 2), count.sum())

In [6]:
# Entries are (key, (stability, review_count)) tuples, sorted by key.
stability = sorted(history_to_stability.items())
if stability:
    # Only print histories whose review count reaches the count found at
    # index 10 of the descending ranking. The index is clamped so that fewer
    # than 11 histories print everything instead of raising IndexError.
    # NOTE(review): index 10 is the 11th-largest count, so at least 11 rows
    # are printed when available - confirm whether a top-10 cut was intended.
    counts_desc = sorted((entry[1][1] for entry in stability), reverse=True)
    threshold = counts_desc[min(10, len(counts_desc) - 1)]
    for entry in stability:
        if entry[1][1] >= threshold:
            print(entry)

print("-----------------")

# Every first-rating entry is printed, sorted by key.
stability = sorted(rating_to_stability.items())
for entry in stability:
    print(entry)

('1', (0.01, 13))
('1,1,1,1,1,1,1,1,1,3,3', (0.07, 6))
('1,1,1,1,1,1,1,1,3,3', (0.01, 8))
('1,1,1,1,1,1,1,3,3', (0.03, 9))
('1,1,1,1,1,1,3,3', (0.07, 12))
('1,1,1,1,1,3,3', (0.16, 19))
('1,1,1,1,3,3', (0.04, 28))
('1,1,1,3,3', (0.09, 56))
('1,1,3,3', (0.48, 88))
('1,3', (0.83, 6))
('1,3,3', (0.81, 107))
('3,3', (6.0, 56))
-----------------
('1', (0.25, 430))
('2', (2.9, 1))
('3', (5.03, 64))


In [7]:
# Keep the rows whose history has a fitted entry and attach, as new columns,
# the stability looked up by full rating history and by first rating.
short_term_df = short_term_df.loc[
    short_term_df["r_history"].isin(history_to_stability.keys())
].assign(
    stability_by_history=lambda frame: frame["r_history"].map(
        lambda history: history_to_stability[history][0]
    ),
    stability_by_rating=lambda frame: frame["first_rating"].map(
        lambda rating: rating_to_stability[rating][0]
    ),
)

In [8]:
from fsrs_optimizer import load_brier

# Evaluate the forgetting curve at each review's interval, once with the
# stability fitted per history and once with the stability fitted per rating.
short_term_df = short_term_df.assign(
    predict_recall_by_history=lambda frame: power_forgetting_curve(
        frame["delta_t"], frame["stability_by_history"]
    ),
    predict_recall_by_rating=lambda frame: power_forgetting_curve(
        frame["delta_t"], frame["stability_by_rating"]
    ),
)


def rmse_bin(predictions, real, bins=20):
    """Count-weighted RMSE between per-bin observed and predicted means.

    The binning is delegated to ``load_brier`` (``bins`` is forwarded to it);
    its ``"detail"`` entry supplies the per-bin prediction means, observed
    means and counts. Bins with a zero count are left out of the error.
    """
    detail = load_brier(predictions, real, bins=bins)["detail"]
    counts = detail["bin_counts"]
    populated = counts > 0  # boolean mask of the non-empty bins

    return root_mean_squared_error(
        detail["bin_correct_means"][populated],
        detail["bin_prediction_means"][populated],
        sample_weight=counts[populated],
    )

# Binned RMSE of each predictor, history-based first and rating-based second.
for prediction_column in ("predict_recall_by_history", "predict_recall_by_rating"):
    print(rmse_bin(short_term_df[prediction_column], short_term_df["y"]))

0.06155138064612561
0.09643249471074404


In [9]:
from sklearn.metrics import log_loss

# Log loss of each predictor, history-based first and rating-based second.
for prediction_column in ("predict_recall_by_history", "predict_recall_by_rating"):
    print(log_loss(short_term_df["y"], short_term_df[prediction_column]))

0.5006925485309014
0.6089876139058069
