# Evaluation of FSRS online and SM-15

The evaluation shows that the difference in performance between FSRS online and SM-15 is not statistically significant under the paired t-test and the Wilcoxon signed-rank test. In other words, FSRS online has reached the level of SM-15.

In [1]:
import scipy
import json
import numpy as np

# Load the evaluation records (one entry per user, as reported below).
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform's locale default.
with open("./evaluation.json", "r", encoding="utf-8") as f:
    evaluation = json.load(f)

print(f"Number of users: {len(evaluation)}")

# Every record stores its own repetition count under 'size'.
size = [item['size'] for item in evaluation]

print(f"Number of repetitions: {sum(size)}")

# Metric keys read from each algorithm's entry in every evaluation record.
metrics = ["RMSE", "MAE", "log_loss"]

print()

for m in metrics:
    # One value per record for each algorithm, collected in the same record
    # order so that the paired tests below line up element by element.
    fsrs_online = np.array([record['fsrs_online'][m] for record in evaluation])
    sm15 = np.array([record['sm15'][m] for record in evaluation])
    fsrs_offline = np.array([record['fsrs_offline'][m] for record in evaluation])

    # Summary statistics (NumPy's default std, i.e. ddof=0).
    print(f"Metric: {m}")
    print(f"FSRS Online\tmean: {fsrs_online.mean():.4f}\tstd: {fsrs_online.std():.4f}")
    print(f"SM15\t\tmean: {sm15.mean():.4f}\tstd: {sm15.std():.4f}")
    print(f"FSRS Offline\tmean: {fsrs_offline.mean():.4f}\tstd: {fsrs_offline.std():.4f}")
    print()

    # Paired t-test and Wilcoxon signed-rank test of each FSRS variant
    # against SM15.
    for heading, candidate in (
        ("FSRS Online vs SM15", fsrs_online),
        ("FSRS Offline vs SM15", fsrs_offline),
    ):
        print(heading)
        print(scipy.stats.ttest_rel(candidate, sm15))
        print(scipy.stats.wilcoxon(candidate, sm15))
    print()

# Universal-metric values, one per evaluation record. The key names pair each
# FSRS variant's "by_sm15" value with SM15's "by_fsrs_*" counterpart.
fsrs_online_universal_metric_by_sm15 = np.array(
    [entry['fsrs_online']['universal_metric_by_sm15'] for entry in evaluation]
)
fsrs_offline_universal_metric_by_sm15 = np.array(
    [entry['fsrs_offline']['universal_metric_by_sm15'] for entry in evaluation]
)
sm15_universal_metric_by_fsrs_online = np.array(
    [entry['sm15']['universal_metric_by_fsrs_online'] for entry in evaluation]
)
sm15_universal_metric_by_fsrs_offline = np.array(
    [entry['sm15']['universal_metric_by_fsrs_offline'] for entry in evaluation]
)

# Summary statistics (NumPy's default std, i.e. ddof=0).
print("Universal Metric")
print(f"FSRS Online by SM15\tmean: {fsrs_online_universal_metric_by_sm15.mean():.4f}\tstd: {fsrs_online_universal_metric_by_sm15.std():.4f}")
print(f"SM15 by FSRS Online\tmean: {sm15_universal_metric_by_fsrs_online.mean():.4f}\tstd: {sm15_universal_metric_by_fsrs_online.std():.4f}")
print(f"FSRS Offline by SM15\tmean: {fsrs_offline_universal_metric_by_sm15.mean():.4f}\tstd: {fsrs_offline_universal_metric_by_sm15.std():.4f}")
print(f"SM15 by FSRS Offline\tmean: {sm15_universal_metric_by_fsrs_offline.mean():.4f}\tstd: {sm15_universal_metric_by_fsrs_offline.std():.4f}")
print()

# Paired t-test and Wilcoxon signed-rank test for each FSRS variant against
# the matching SM15 series.
for heading, fsrs_scores, sm15_scores in (
    (
        "FSRS Online vs SM15",
        fsrs_online_universal_metric_by_sm15,
        sm15_universal_metric_by_fsrs_online,
    ),
    (
        "FSRS Offline vs SM15",
        fsrs_offline_universal_metric_by_sm15,
        sm15_universal_metric_by_fsrs_offline,
    ),
):
    print(heading)
    print(scipy.stats.ttest_rel(fsrs_scores, sm15_scores))
    print(scipy.stats.wilcoxon(fsrs_scores, sm15_scores))

Number of users: 14
Number of repetitions: 213644

Metric: RMSE
FSRS Online	mean: 0.0981	std: 0.0516
SM15		mean: 0.1227	std: 0.0418
FSRS Offline	mean: 0.0588	std: 0.0306

FSRS Online vs SM15
TtestResult(statistic=-1.7889508439977053, pvalue=0.09693610797385332, df=13)
WilcoxonResult(statistic=22.0, pvalue=0.0579833984375)
FSRS Offline vs SM15
TtestResult(statistic=-8.73606963005117, pvalue=8.422934991953697e-07, df=13)
WilcoxonResult(statistic=0.0, pvalue=0.0001220703125)

Metric: MAE
FSRS Online	mean: 0.0651	std: 0.0418
SM15		mean: 0.0789	std: 0.0370
FSRS Offline	mean: 0.0401	std: 0.0256

FSRS Online vs SM15
TtestResult(statistic=-1.2321783545677771, pvalue=0.23970415925776878, df=13)
WilcoxonResult(statistic=23.0, pvalue=0.067626953125)
FSRS Offline vs SM15
TtestResult(statistic=-7.887414556631847, pvalue=2.610627279018964e-06, df=13)
WilcoxonResult(statistic=0.0, pvalue=0.0001220703125)

Metric: log_loss
FSRS Online	mean: 0.3768	std: 0.1347
SM15		mean: 0.4106	std: 0.1756
FSRS Offlin

# Reference

- fsrs: https://github.com/open-spaced-repetition/fsrs4anki/wiki/The-Algorithm
- sm15: https://supermemo.guru/wiki/Algorithm_SM-15
- rmse: https://en.wikipedia.org/wiki/Root-mean-square_deviation
- mae: https://en.wikipedia.org/wiki/Mean_absolute_error
- log_loss: https://en.wikipedia.org/wiki/Cross-entropy
- universal_metric: https://supermemo.guru/wiki/Universal_metric_for_cross-comparison_of_spaced_repetition_algorithms
- ttest_rel: https://en.wikipedia.org/wiki/Student%27s_t-test#Dependent_t-test_for_paired_samples
- wilcoxon: https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test