From 404e1f7ed3dc8391538c0c56a6b0bc12242cd637 Mon Sep 17 00:00:00 2001
From: Jarrett Ye
Date: Wed, 24 Apr 2024 15:54:16 +0800
Subject: [PATCH 1/6] use median for recall_costs and learn_cost

---
 src/fsrs_optimizer/fsrs_optimizer.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/fsrs_optimizer/fsrs_optimizer.py b/src/fsrs_optimizer/fsrs_optimizer.py
index f400828..06bb0d9 100644
--- a/src/fsrs_optimizer/fsrs_optimizer.py
+++ b/src/fsrs_optimizer/fsrs_optimizer.py
@@ -599,14 +599,15 @@ def create_time_series(
         self.recall_costs = np.zeros(3)
         recall_costs = recall_card_revlog.groupby(by="review_rating")[
             "review_duration"
-        ].mean()
+        ].median()
         self.recall_costs[recall_costs.index - 2] = recall_costs / 1000
 
         self.state_sequence = np.array(df["review_state"])
         self.duration_sequence = np.array(df["review_duration"])
         self.learn_cost = round(
-            df[df["review_state"] == Learning]["review_duration"].sum()
-            / len(df["card_id"].unique())
+            df[df["review_state"] == Learning].groupby("card_id").agg({"review_duration": "sum"})[
+                "review_duration"
+            ].median()
             / 1000,
             1,
         )

From e718ad1a3ab300cb163afadfc3cb9d9da15ed50c Mon Sep 17 00:00:00 2001
From: Jarrett Ye
Date: Thu, 25 Apr 2024 15:40:19 +0800
Subject: [PATCH 2/6] use median in calculating forget cost

---
 src/fsrs_optimizer/fsrs_optimizer.py | 48 ++++++++++++----------------
 1 file changed, 20 insertions(+), 28 deletions(-)

diff --git a/src/fsrs_optimizer/fsrs_optimizer.py b/src/fsrs_optimizer/fsrs_optimizer.py
index 06bb0d9..4dea9d7 100644
--- a/src/fsrs_optimizer/fsrs_optimizer.py
+++ b/src/fsrs_optimizer/fsrs_optimizer.py
@@ -605,9 +605,10 @@ def create_time_series(
         self.state_sequence = np.array(df["review_state"])
         self.duration_sequence = np.array(df["review_duration"])
         self.learn_cost = round(
-            df[df["review_state"] == Learning].groupby("card_id").agg({"review_duration": "sum"})[
-                "review_duration"
-            ].median()
+            df[df["review_state"] == Learning]
+            .groupby("card_id")
+            .agg({"review_duration": "sum"})["review_duration"]
+            .median()
             / 1000,
             1,
         )
@@ -1186,34 +1187,25 @@ def find_optimal_retention(
         verbose=True,
     ):
         """should not be called before predict_memory_states"""
-        recall_cost = 8
-        forget_cost = 25
-
-        state_block = dict()
-        state_count = dict()
-        state_duration = dict()
-
+        state_durations = dict()
         last_state = self.state_sequence[0]
-        state_block[last_state] = 1
-        state_count[last_state] = 1
-        state_duration[last_state] = self.duration_sequence[0]
-        for i, state in enumerate(self.state_sequence[1:]):
-            state_count[state] = state_count.setdefault(state, 0) + 1
-            state_duration[state] = (
-                state_duration.setdefault(state, 0) + self.duration_sequence[i]
-            )
-            if state != last_state:
-                state_block[state] = state_block.setdefault(state, 0) + 1
+        state_durations[last_state] = [self.duration_sequence[0]]
+        for i, state in enumerate(self.state_sequence[1:], start=1):
+            if state not in state_durations:
+                state_durations[state] = []
+            if state == Review:
+                state_durations[state].append(self.duration_sequence[i])
+            else:
+                if state == last_state:
+                    state_durations[state][-1] += self.duration_sequence[i]
+                else:
+                    state_durations[state].append(self.duration_sequence[i])
             last_state = state
 
-        recall_cost = round(state_duration[Review] / state_count[Review] / 1000, 1)
-
-        if Relearning in state_count and Relearning in state_block:
-            forget_cost = round(
-                state_duration[Relearning] / state_block[Relearning] / 1000
-                + recall_cost,
-                1,
-            )
+        recall_cost = round(np.median(state_durations[Review]) / 1000, 1)
+        forget_cost = round(
+            np.median(state_durations[Relearning]) / 1000 + recall_cost, 1
+        )
         if verbose:
             tqdm.write(f"average time for failed reviews: {forget_cost}s")
             tqdm.write(f"average time for recalled reviews: {recall_cost}s")

From c75ef1de714231fdec860c894e46fe514bd40290 Mon Sep 17 00:00:00 2001
From: Jarrett Ye
Date: Thu, 25 Apr 2024 21:02:03 +0800
Subject: [PATCH 3/6] exclude zero

---
 src/fsrs_optimizer/fsrs_optimizer.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/fsrs_optimizer/fsrs_optimizer.py b/src/fsrs_optimizer/fsrs_optimizer.py
index 4dea9d7..00add2f 100644
--- a/src/fsrs_optimizer/fsrs_optimizer.py
+++ b/src/fsrs_optimizer/fsrs_optimizer.py
@@ -597,15 +597,22 @@ def create_time_series(
         )
 
         self.recall_costs = np.zeros(3)
+        recall_card_revlog = recall_card_revlog[
+            recall_card_revlog["review_duration"] > 0
+        ]
         recall_costs = recall_card_revlog.groupby(by="review_rating")[
             "review_duration"
         ].median()
         self.recall_costs[recall_costs.index - 2] = recall_costs / 1000
 
-        self.state_sequence = np.array(df["review_state"])
-        self.duration_sequence = np.array(df["review_duration"])
+        self.state_sequence = np.array(
+            df[df["review_duration"] > 0]["review_state"]
+        )
+        self.duration_sequence = np.array(
+            df[df["review_duration"] > 0]["review_duration"]
+        )
         self.learn_cost = round(
-            df[df["review_state"] == Learning]
+            df[(df["review_state"] == Learning) & (df["review_duration"] > 0)]
             .groupby("card_id")
             .agg({"review_duration": "sum"})["review_duration"]
             .median()
@@ -1127,7 +1134,9 @@ def preview_sequence(self, test_rating_sequence: str, requestRetention: float):
             (
                 f"{ivl}d"
                 if ivl < 30
-                else f"{ivl / 30:.1f}m" if ivl < 365 else f"{ivl / 365:.1f}y"
+                else f"{ivl / 30:.1f}m"
+                if ivl < 365
+                else f"{ivl / 365:.1f}y"
             )
             for ivl in map(int, t_history.split(","))
         ]
@@ -1174,9 +1183,9 @@ def predict_memory_states(self):
         self.difficulty_distribution_padding = np.zeros(10)
         for i in range(10):
            if i + 1 in self.difficulty_distribution.index:
-                self.difficulty_distribution_padding[i] = (
-                    self.difficulty_distribution.loc[i + 1]
-                )
+                self.difficulty_distribution_padding[
+                    i
+                ] = self.difficulty_distribution.loc[i + 1]
         return self.difficulty_distribution
 
     def find_optimal_retention(

From 231670962fe4163ea0b0d782cb9dcd0fe75d9ebd Mon Sep 17 00:00:00 2001
From: Jarrett Ye
Date: Thu, 25 Apr 2024 21:05:05 +0800
Subject: [PATCH 4/6] fix format

---
 src/fsrs_optimizer/fsrs_optimizer.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/fsrs_optimizer/fsrs_optimizer.py b/src/fsrs_optimizer/fsrs_optimizer.py
index 00add2f..2c7abb6 100644
--- a/src/fsrs_optimizer/fsrs_optimizer.py
+++ b/src/fsrs_optimizer/fsrs_optimizer.py
@@ -1134,9 +1134,7 @@ def preview_sequence(self, test_rating_sequence: str, requestRetention: float):
             (
                 f"{ivl}d"
                 if ivl < 30
-                else f"{ivl / 30:.1f}m"
-                if ivl < 365
-                else f"{ivl / 365:.1f}y"
+                else f"{ivl / 30:.1f}m" if ivl < 365 else f"{ivl / 365:.1f}y"
             )
             for ivl in map(int, t_history.split(","))
         ]
@@ -1183,9 +1181,9 @@ def predict_memory_states(self):
         self.difficulty_distribution_padding = np.zeros(10)
         for i in range(10):
             if i + 1 in self.difficulty_distribution.index:
-                self.difficulty_distribution_padding[
-                    i
-                ] = self.difficulty_distribution.loc[i + 1]
+                self.difficulty_distribution_padding[i] = (
+                    self.difficulty_distribution.loc[i + 1]
+                )
         return self.difficulty_distribution
 
     def find_optimal_retention(

From 9d6b23c56ea79bc00e5c140ece5eb8043f05a6b7 Mon Sep 17 00:00:00 2001
From: Jarrett Ye
Date: Thu, 25 Apr 2024 21:08:23 +0800
Subject: [PATCH 5/6] bump version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 90dd46e..2b1012c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "FSRS-Optimizer"
-version = "4.28.1"
+version = "4.28.2"
 readme = "README.md"
 dependencies = [
     "matplotlib>=3.7.0",

From 813f12b8dbaa6118f5d94d5952c30a9484c71723 Mon Sep 17 00:00:00 2001
From: Jarrett Ye
Date: Thu, 25 Apr 2024 22:08:00 +0800
Subject: [PATCH 6/6] exclude >20min

---
 src/fsrs_optimizer/fsrs_optimizer.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/fsrs_optimizer/fsrs_optimizer.py b/src/fsrs_optimizer/fsrs_optimizer.py
index 2c7abb6..c6943d1 100644
--- a/src/fsrs_optimizer/fsrs_optimizer.py
+++ b/src/fsrs_optimizer/fsrs_optimizer.py
@@ -598,7 +598,8 @@ def create_time_series(
 
         self.recall_costs = np.zeros(3)
         recall_card_revlog = recall_card_revlog[
-            recall_card_revlog["review_duration"] > 0
+            (recall_card_revlog["review_duration"] > 0)
+            & (df["review_duration"] < 1200000)
         ]
         recall_costs = recall_card_revlog.groupby(by="review_rating")[
             "review_duration"
@@ -605,14 +606,22 @@ def create_time_series(
         ].median()
         self.recall_costs[recall_costs.index - 2] = recall_costs / 1000
 
         self.state_sequence = np.array(
-            df[df["review_duration"] > 0]["review_state"]
+            df[(df["review_duration"] > 0) & (df["review_duration"] < 1200000)][
+                "review_state"
+            ]
         )
         self.duration_sequence = np.array(
-            df[df["review_duration"] > 0]["review_duration"]
+            df[(df["review_duration"] > 0) & (df["review_duration"] < 1200000)][
+                "review_duration"
+            ]
         )
         self.learn_cost = round(
-            df[(df["review_state"] == Learning) & (df["review_duration"] > 0)]
+            df[
+                (df["review_state"] == Learning)
+                & (df["review_duration"] > 0)
+                & (df["review_duration"] < 1200000)
+            ]
             .groupby("card_id")
             .agg({"review_duration": "sum"})["review_duration"]
             .median()