In [1]:
import pandas as pd
from pathlib import Path

# df = pd.read_csv(Path("../fsrs-benchmark/dataset/3.csv"))

In [2]:
from datetime import timedelta

# Hour of the local day at which a new "study day" begins: the review
# timestamp is shifted back by this many hours before being floored to a day.
next_day_starts_at = 4
timezone = "Europe/Moscow"

df = pd.read_csv("../fsrs-optimizer/dataset/Main_27_04_2023_apkg/revlog.csv")
# Keep each card's reviews contiguous and in chronological order. The
# row-to-row diff below is only a per-card interval when consecutive rows
# belong to the same card, and code that flattens per-card groupby results
# positionally needs the rows grouped in ascending card_id order.
df = df.sort_values(by=["card_id", "review_time"], ignore_index=True)

# review_time is integer-divided by 1000 and parsed as seconds
# (presumably epoch milliseconds in the export -- confirm against the data).
review_utc = pd.to_datetime(df["review_time"] // 1000, unit="s").dt.tz_localize("UTC")
df["review_date"] = review_utc.dt.tz_convert(timezone)

# Drop rows whose local review year is before 2006.
df = df[~(df["review_date"].dt.year < 2006)].copy()

# Shift by the day-start hour, floor to the local day, and express the day as
# a Julian date so that differences between rows are measured in days.
shifted_date = df["review_date"] - timedelta(hours=int(next_day_starts_at))
df["real_days"] = pd.DatetimeIndex(
    shifted_date.dt.floor("D", ambiguous="infer", nonexistent="shift_forward")
).to_julian_date()

# Days since the previous row; the very first row has no predecessor -> 0.
df["delta_t"] = df["real_days"].diff().fillna(0)
# 1-based position of the review within its card.
df["i"] = df.groupby("card_id").cumcount() + 1
# A card's first review has no previous review of the same card: mark with -1.
df.loc[df["i"] == 1, "delta_t"] = -1
df = df.rename(columns={"review_rating": "rating"})

In [3]:
from itertools import accumulate


def cum_concat(x):
    """Return the running ``+`` accumulation of the items of ``x`` as a list.

    The first result is the first item itself; every following result is the
    previous result plus the next item. For a sequence of lists this gives the
    growing prefixes, e.g. ``[[1], [2], [3]] -> [[1], [1, 2], [1, 2, 3]]``.
    An empty input yields an empty list.
    """
    remaining = iter(x)
    try:
        running = next(remaining)
    except StopIteration:
        return []
    prefixes = [running]
    for item in remaining:
        running = running + item
        prefixes.append(running)
    return prefixes


def _prefixes_per_review(values):
    """Wrap each value in a one-element list and return the growing prefixes."""
    return cum_concat([[value] for value in values])


def _prior_history_strings(grouped_prefixes):
    """Flatten per-card prefix lists into comma-joined strings.

    The last element of every prefix (the current review itself) is left out,
    so each string holds only the values that came before that review.
    """
    flattened = []
    for card_prefixes in grouped_prefixes:
        for prefix in card_prefixes:
            flattened.append(",".join(str(value) for value in prefix[:-1]))
    return flattened


reviews_by_card = df.groupby("card_id", group_keys=False)
t_history = reviews_by_card["delta_t"].apply(_prefixes_per_review)
r_history = reviews_by_card["rating"].apply(_prefixes_per_review)

# NOTE(review): the flattened lists are assigned positionally, and groupby
# yields cards in ascending key order by default, so this relies on the rows
# of df already being grouped by ascending card_id.
df["r_history"] = _prior_history_strings(r_history)
df["t_history"] = _prior_history_strings(t_history)

# Binary outcome: 1 when the rating is above 1, otherwise 0.
df["y"] = df["rating"].map(lambda rating: int(rating > 1))

In [4]:
# Keep only rows whose delta_t is non-zero, then renumber the remaining
# reviews of every card starting from 1.
has_nonzero_interval = df["delta_t"] != 0
df = df.loc[has_nonzero_interval].copy()
df["i"] = df.groupby("card_id").cumcount() + 1

# First character of the history string ("" when the history is empty).
df["first_rating"] = df["r_history"].map(lambda history: history[:1])

# Second remaining review of each card.
short_term_df = df.loc[df["i"] == 2]

In [5]:
import numpy as np
from scipy.optimize import minimize
from sklearn.metrics import root_mean_squared_error
from fsrs_optimizer import power_forgetting_curve


def history_loss(stability, delta_t, recall, count, init_s0):
    """Count-weighted log loss of the forgetting curve plus an L1 pull to ``init_s0``.

    ``delta_t``, ``recall`` and ``count`` are the per-interval columns of one
    aggregated group; ``stability`` is the candidate value supplied by the
    optimizer.
    """
    y_pred = power_forgetting_curve(delta_t, stability)
    per_interval = (
        -(recall * np.log(y_pred) + (1 - recall) * np.log(1 - y_pred)) * count
    )
    penalty = np.abs(stability - init_s0) / 16
    return sum(per_interval) + penalty


history_to_stability = {}
init_s0 = 1

for r_history in short_term_df["r_history"].value_counts().index:
    same_history = short_term_df.loc[short_term_df["r_history"] == r_history]
    # One row per distinct delta_t with the mean outcome and the sample count.
    group = same_history.groupby("delta_t").agg({"y": ["mean", "count"]}).reset_index()
    outcome = group["y"]
    delta_t = group["delta_t"]
    recall = outcome["mean"]
    count = outcome["count"]

    res = minimize(
        history_loss,
        x0=init_s0,
        args=(delta_t, recall, count, init_s0),
        bounds=((0.01, 300),),
    )
    params = res.x
    stability = params[0]

    # Fit diagnostic; computed here but not stored in the result table.
    predict_recall = power_forgetting_curve(delta_t, *params)
    rmse = root_mean_squared_error(recall, predict_recall, sample_weight=count)

    # history -> (fitted stability rounded to 2 decimals, number of reviews)
    history_to_stability[r_history] = (round(stability, 2), count.sum())

In [6]:
rating_to_stability = {}
init_s0 = 1
stability_bounds = ((0.01, 300),)

for first_rating in short_term_df["first_rating"].value_counts().index:
    rating_mask = short_term_df["first_rating"] == first_rating
    # One row per distinct delta_t with the mean outcome and the sample count.
    group = (
        short_term_df.loc[rating_mask]
        .groupby("delta_t")
        .agg(recall=("y", "mean"), count=("y", "count"))
        .reset_index()
    )
    delta_t = group["delta_t"]
    recall = group["recall"]
    count = group["count"]

    def loss(stability):
        """Count-weighted log loss for this group plus an L1 pull to init_s0."""
        y_pred = power_forgetting_curve(delta_t, stability)
        hit_term = recall * np.log(y_pred)
        miss_term = (1 - recall) * np.log(1 - y_pred)
        logloss = sum(-(hit_term + miss_term) * count)
        return logloss + np.abs(stability - init_s0) / 16

    res = minimize(loss, x0=init_s0, bounds=stability_bounds)
    params = res.x
    stability = params[0]

    # Fit diagnostic; computed here but not stored in the result table.
    predict_recall = power_forgetting_curve(delta_t, *params)
    rmse = root_mean_squared_error(recall, predict_recall, sample_weight=count)

    # first rating -> (fitted stability rounded to 2 decimals, number of reviews)
    rating_to_stability[first_rating] = (round(stability, 2), count.sum())

In [7]:
# How many of the most-reviewed histories set the print cut-off: a history is
# listed when its review count is at least that of the 11th most-reviewed one.
TOP_HISTORY_COUNT = 11

history_rows = sorted(history_to_stability.items())
review_counts = sorted(
    (n_reviews for _, n_reviews in history_to_stability.values()), reverse=True
)
if review_counts:
    # Clamp the rank so that fewer than TOP_HISTORY_COUNT histories print all
    # of them instead of raising IndexError.
    cutoff_rank = min(TOP_HISTORY_COUNT, len(review_counts)) - 1
    threshold = review_counts[cutoff_rank]
else:
    threshold = 0

for row in history_rows:
    # row is (history, (stability, n_reviews))
    if row[1][1] >= threshold:
        print(row)

print("-----------------")

for row in sorted(rating_to_stability.items()):
    print(row)

('2', (0.23, 7276))
('2,2,3,3', (1.72, 133))
('2,3', (1.48, 136))
('2,3,3', (2.16, 435))
('2,3,3,3', (9.01, 109))
('3', (1.18, 13152))
('3,2', (0.5, 279))
('3,3', (2.18, 2080))
('3,3,3', (15.55, 227))
('3,4', (10.48, 154))
('4', (181.33, 4570))
-----------------
('1', (0.5, 82))
('2', (0.28, 8894))
('3', (1.24, 16289))
('4', (181.33, 4570))


In [8]:
def _stability_lookup(table):
    """Return a mapper giving the first element of ``table[key]`` (the stability)."""

    def lookup(key):
        return table[key][0]

    return lookup


# Keep only rows whose history has an entry in history_to_stability.
has_fitted_history = short_term_df["r_history"].isin(history_to_stability.keys())
short_term_df = short_term_df.loc[has_fitted_history].copy()

short_term_df["stability_by_history"] = short_term_df["r_history"].map(
    _stability_lookup(history_to_stability)
)
short_term_df["stability_by_rating"] = short_term_df["first_rating"].map(
    _stability_lookup(rating_to_stability)
)

In [9]:
from fsrs_optimizer import load_brier

# Evaluate the forgetting curve at each row's delta_t, once with the stability
# looked up by full rating history and once with the one looked up by first
# rating.
elapsed = short_term_df["delta_t"]
history_stability = short_term_df["stability_by_history"]
rating_stability = short_term_df["stability_by_rating"]

short_term_df["predict_recall_by_history"] = power_forgetting_curve(
    elapsed, history_stability
)
short_term_df["predict_recall_by_rating"] = power_forgetting_curve(
    elapsed, rating_stability
)


def rmse_bin(predictions, real, bins=20):
    """Count-weighted RMSE between per-bin observed and predicted means.

    The per-bin statistics are taken from the ``"detail"`` entry returned by
    ``load_brier(predictions, real, bins=bins)``; bins with a zero count are
    excluded before the error is computed.
    (How ``load_brier`` forms the bins is not visible here.)
    """
    detail = load_brier(predictions, real, bins=bins)["detail"]
    predicted_means = detail["bin_prediction_means"]
    observed_means = detail["bin_correct_means"]
    sizes = detail["bin_counts"]
    occupied = sizes > 0

    return root_mean_squared_error(
        observed_means[occupied],
        predicted_means[occupied],
        sample_weight=sizes[occupied],
    )


# Binned RMSE of each prediction column against the observed outcome y.
observed = short_term_df["y"]
for predicted in (
    short_term_df["predict_recall_by_history"],
    short_term_df["predict_recall_by_rating"],
):
    print(rmse_bin(predicted, observed))

0.07260396358288007
0.07315641369077919


In [10]:
from sklearn.metrics import log_loss

# Log loss of each prediction column against the observed outcome y.
observed = short_term_df["y"]
for predicted in (
    short_term_df["predict_recall_by_history"],
    short_term_df["predict_recall_by_rating"],
):
    print(log_loss(observed, predicted))

0.4086080804792899
0.42574583160823953
