Make it easier to check the top 5 codes over time output

1. Include the code label and the total count of codes in the top codes over time CSV.
2. Round the code proportion to 2 decimal places.
3. Switch the order of code proportion table creation and plotting, so that we first make the table and then use that table to generate the figure. This also makes it possible to re-generate the plot locally in Python from the underlying data.
opensafely · Sep 19, 2023 · 0fcc6b2 · 0fcc6b2
1 parent 57fc927
commit 0fcc6b2
Showing 1 changed file with 34 additions and 17 deletions.
diff --git a/analysis/report/top_5_report.py b/analysis/report/top_5_report.py
@@ -227,25 +227,23 @@ def get_proportion_of_events(df, code, date):
     if total_events == 0:
         row["proportion"] = 0
     else:
-        row["proportion"] = (code_events / total_events) * 100
+        row["proportion"] = round((code_events / total_events) * 100, 2)
+        row["total"] = total_events
         row["count"] = code_events
 
     return row
 
 
-def plot_top_codes_over_time(
-    code_df, top_codes, measure, output_dir, frequency, xtick_frequency=1
-):
+def make_top_codes_over_time_dict(code_df, top_codes, measure, output_dir):
     """
-    Plots the top 5 codes over time for each measure.
+    Creates and saves df with proportion of each of the top codes
     Args:
         code_df: A codelist table.
         top_codes: A dictionary of the top codes for each measure with corresponding descriptions.
         measure: The measure ID.
-        output_dir: The directory to save the plot to.
+        output_dir: The directory to save the csv to.
 
     """
-
     # Create a new dataframe with the proportion of events for each code on
     # each date.
     code_proportions = pd.DataFrame()
@@ -261,7 +259,27 @@ def plot_top_codes_over_time(
                 ),
             ]
         )
+    code_proportions["name"] = code_proportions["code"].map(top_codes)
 
+    # save the underlying data for the plot
+    code_proportions.to_csv(
+        output_dir / f"{measure}_top_5_codes_over_time.csv", index=False
+    )
+    return code_proportions
+
+
+def plot_top_codes_over_time(
+    code_proportions, measure, output_dir, frequency, xtick_frequency=1
+):
+    """
+    Plots the top 5 codes over time for each measure.
+    Args:
+        code_proportions: a table of top code proportions.
+        measure: The measure ID.
+        output_dir: The directory to save the plot to.
+        frequency: month or week
+        xtick_frequency: interval to display x-axis tick label
+    """
     plt.figure(figsize=(10, 6))
     # seaborn styling
     sns.set_style("whitegrid")
@@ -274,10 +292,9 @@ def plot_top_codes_over_time(
         code_proportions["date"] = pd.to_datetime(code_proportions["date"])
         # Plot the proportion of events for each code on each date.
         # Plots should be on the same graph.
-        for code, description in top_codes.items():
-            code_proportions.loc[code_proportions["code"] == code, :].plot(
-                x="date", y="count", label=description, ax=ax
-            )
+        groups = code_proportions.groupby("code")
+        for code, data in groups:
+            data.plot(x="date", y="count", label=data.iloc[0]["name"], ax=ax)
 
         if frequency == "month":
             xticks = pd.date_range(
@@ -311,11 +328,6 @@ def plot_top_codes_over_time(
         bbox_inches="tight",
     )
 
-    # save the underlying data for the plot
-    code_proportions.to_csv(
-        output_dir / f"{measure}_top_5_codes_over_time.csv", index=False
-    )
-
 
 def main():
     args = parse_args()
@@ -407,11 +419,16 @@ def main():
         )
 
         # plot the top codes over time
-        plot_top_codes_over_time(
+        code_proportions = make_top_codes_over_time_dict(
             code_df=code_df,
             top_codes=top_codes_dict,
             measure=measure,
             output_dir=output_dir,
+        )
+        plot_top_codes_over_time(
+            code_proportions=code_proportions,
+            measure=measure,
+            output_dir=output_dir,
             frequency=frequency,
             xtick_frequency=xtick_frequency,
         )