In [27]:
# !pip install japanize-matplotlib
# !pip install reportlab
# !pip install --upgrade reportlab

In [28]:
# -----------------------------------------------------------
# 1) Imports & Colab authentication
# -----------------------------------------------------------
import logging
import os
from datetime import datetime, timedelta
from typing import Dict, List

import japanize_matplotlib  # enables Japanese font rendering
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.colab import auth, drive
from google.auth import default
import gspread
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import (
    Image as RLImage,
    PageBreak,
    Paragraph,
    SimpleDocTemplate,
    Spacer,
    Table,
    TableStyle,
)

# -----------------------------------------------------------
# 2) CONFIG - tweak here for next month
# -----------------------------------------------------------
SPREADSHEET_NAME = "ga4_seo_lp_report_2025"   # GA4 export workbook

# Landing-page paths that make up the first group; rows are matched against
# the "landingPage" column with an exact `isin` test in build_pdf().
GROUPA_URLS = [
    "/",
    "/profile.html",
    "/tool",
    "/profile",
]

# Landing-page paths that make up the second group (same exact matching).
GROUPB_URLS = [
    "/en",
    "/en/profile.html",
    "/en/tool",
    "/en/profile",
]

# Human-readable names for the two groups; used in PDF headings, chart
# titles, chart file names and as keys of ANNOTATIONS.
GROUPA_NAME = "Group A"
GROUPB_NAME = "Group B"

# Metrics to analyse: GA4 column name -> pretty label.
# Add/remove freely; insertion order drives the order of the report sections.
METRICS: Dict[str, str] = {
    "screenPageViews": "Pageviews",
    "sessions": "Sessions",
    "engagedSessions": "Engaged Sessions",
    "newUsers": "New Users",
}
# Pretty labels only, in the same order as METRICS.
METRIC_LABELS: List[str] = list(METRICS.values())

# Aggregation per pretty label used by the comparison table
# ("sum" for every metric unless overridden here).
COMPARISON_AGGS: Dict[str,str] = { m: "sum" for m in METRIC_LABELS }

# -----------------------------------------------------------
# 2-b) DERIVED CONFIG - must come after METRICS (RENAME_MAP is built from it)
# -----------------------------------------------------------
# Explanations printed on the notes page of the PDF, keyed by pretty label.
metric_notes = {
    "Pageviews": "Number of pageviews from Organic Search Sessions.",
    "Sessions": "Number of sessions initiated from Organic Search.",
    "Engaged Sessions": "Sessions with significant engagement (e.g., multiple page views or ≥10 s).",
    # "Add To Cart": "Count of add-to-cart events.",
    # "Ecommerce Purchase": "Number of purchase events.",
    "New Users": "First-time users visiting the site.",
    # "First Time Purchasers": "Users who completed their first purchase."
}

# GA4 column name -> pretty label, applied to the loaded DataFrame columns.
RENAME_MAP = {ga: pretty for ga, pretty in METRICS.items()}

# Calendar month -> position inside the fiscal year (September first,
# August last); used by sort_month() to order table rows.
MONTH_ORDER = {9:1,10:2,11:3,12:4,1:5,2:6,3:7,4:8,5:9,6:10,7:11,8:12}

# Comparison window in calendar months. When the start month is greater than
# the end month the window wraps over the year end (here: Sep-Dec plus Jan-May).
COMP_START_MONTH = 9
COMP_END_MONTH   = 5

# Inclusive date range: it selects which df_YYYYMM worksheets are read and
# which rows are kept after loading.
DATA_START = "2023-01-01"
DATA_END   = "2025-05-31"        # zero-padded ISO date (YYYY-MM-DD) avoids ambiguous parsing

# Bar colour per fiscal-year column of the monthly charts.
BAR_COLORS = {"FY23": "#BFE9DB", "FY24": "#6AC1B7", "FY25": "#264E86"}

# Optional chart annotations keyed by group name. Each entry provides
# "year", "month" and "text"; save_bar_chart() draws a dashed vertical line
# with the text at that month. Empty dict = no annotations.
ANNOTATIONS: Dict[str, Dict[str, object]] = {}
# Example:
# ANNOTATIONS: Dict[str, Dict[str, object]] = {
#     GROUPA_NAME: {"year": 2025, "month": 3, "text": "Launched new UI"},
#     GROUPB_NAME: {"year": 2024, "month": 11, "text": "A/B test started"},
# }

In [29]:
# -----------------------------------------------------------
# 3) Start-up (mount Drive, auth Sheets)
# -----------------------------------------------------------
# Mount Google Drive into the Colab filesystem; force_remount=True re-mounts
# even when a mount already exists, so re-running this cell is safe.
drive.mount("/content/drive", force_remount=True)
# Interactive Colab sign-in; must run before default() can return credentials.
auth.authenticate_user()
# default() returns a (credentials, project) pair; only the credentials are used.
creds, _ = default()
# Module-level gspread client reused by load_ga4_data().
GC = gspread.authorize(creds)

Mounted at /content/drive


In [30]:
# -----------------------------------------------------------
# 4) Load GA4 data
# -----------------------------------------------------------


def list_sheet_names(start: str, end: str) -> List[str]:
    """Return one worksheet name per calendar month between two dates.

    Args:
        start: First date of the range (any string pandas can parse).
        end: Last date of the range, inclusive.

    Returns:
        Names of the form ``df_YYYYMM`` in chronological order, one for
        every month touched by the range.
    """
    names: List[str] = []
    for period in pd.period_range(start, end, freq="M"):
        names.append("df_" + period.strftime("%Y%m"))
    return names

def load_ga4_data(header_offset: int = 14, fiscal_start_month: int = 9) -> pd.DataFrame:
    """Load the monthly GA4 export worksheets into one tidy DataFrame.

    Every worksheet named by ``list_sheet_names(DATA_START, DATA_END)`` is
    read from the ``SPREADSHEET_NAME`` workbook. Missing worksheets are
    skipped with a warning.

    Args:
        header_offset: Number of leading rows to discard in each worksheet.
            The row at this offset is used as the column header and the
            rows after it as data.
        fiscal_start_month: Calendar month in which a fiscal year starts;
            months from this one onwards belong to the *next* calendar
            year's fiscal year.

    Returns:
        DataFrame with the GA4 columns renamed via ``RENAME_MAP``,
        ``yearMonth`` parsed to datetime, metric columns as ``int``, rows
        limited to ``DATA_START``..``DATA_END`` and two derived columns:
        ``fiscal_year`` and ``month``.

    Raises:
        RuntimeError: If none of the expected worksheets yielded any rows.
        KeyError: If ``yearMonth`` or one of the metric columns is absent.
    """
    ss = GC.open(SPREADSHEET_NAME)
    frames: List[pd.DataFrame] = []
    for sht in list_sheet_names(DATA_START, DATA_END):
        try:
            vals = ss.worksheet(sht).get_all_values()[header_offset:]
        except gspread.exceptions.WorksheetNotFound:
            logging.warning("Sheet %s missing; skipping", sht)
            continue
        if vals:
            frames.append(pd.DataFrame(vals[1:], columns=vals[0]))
    if not frames:
        raise RuntimeError("No data found in Sheets range")

    df = pd.concat(frames, ignore_index=True).rename(columns=RENAME_MAP)

    # Fail early with a readable message instead of a bare KeyError later on.
    required = ["yearMonth", *RENAME_MAP.values()]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"GA4 export is missing expected column(s): {missing}")

    df["yearMonth"] = pd.to_datetime(df["yearMonth"], format="%Y%m")

    for nice in RENAME_MAP.values():
        parsed = pd.to_numeric(df[nice], errors="coerce")
        # Non-empty cells that could not be parsed still become 0 (as before),
        # but the coercion is now reported instead of being silent.
        unparsed = (
            parsed.isna()
            & df[nice].notna()
            & (df[nice].astype(str).str.strip() != "")
        )
        if unparsed.any():
            logging.warning(
                "%d value(s) in column %s are not numeric; treated as 0",
                int(unparsed.sum()), nice,
            )
        df[nice] = parsed.fillna(0).astype(int)

    df = df[(df["yearMonth"] >= DATA_START) & (df["yearMonth"] <= DATA_END)].copy()

    # Vectorised fiscal-year computation; also yields an integer column when
    # the filtered frame is empty.
    month = df["yearMonth"].dt.month
    df["fiscal_year"] = (
        df["yearMonth"].dt.year + (month >= fiscal_start_month)
    ).astype(int)
    df["month"] = month
    return df

# -----------------------------------------------------------
# 5) Helpers (tables, charts)
# -----------------------------------------------------------

def sort_month(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with its rows ordered by fiscal-month position.

    The ``month`` column is translated through ``MONTH_ORDER`` into a
    temporary sort key, which is removed again before returning. The input
    frame is not modified.
    """
    fiscal_rank = df["month"].map(MONTH_ORDER)
    keyed = df.assign(_k=fiscal_rank)
    ordered = keyed.sort_values("_k")
    return ordered.drop("_k", axis=1)

def monthly_table(df_sub: pd.DataFrame, metric: str) -> pd.DataFrame:
    """Build the month x fiscal-year table for one metric.

    Args:
        df_sub: Rows of one URL group; must contain ``fiscal_year``,
            ``month`` and the ``metric`` column.
        metric: Name of the (pretty) metric column to sum.

    Returns:
        DataFrame indexed by ``month`` (ordered by ``sort_month``) with
        integer columns ``FY23``, ``FY24``, ``FY25`` plus two text columns:
        ``YoY`` (FY25 vs FY24 of the same month) and ``MoM`` (FY25 vs the
        previous row). Both show ``"-"`` when the ratio is undefined.
        Fiscal years outside FY23-FY25 that occur in the data are kept as
        additional columns.
    """
    required_years = (2023, 2024, 2025)

    pivot = (
        df_sub.groupby(["fiscal_year", "month"], as_index=False)[metric].sum()
              .pivot(index="month", columns="fiscal_year", values=metric)
    )

    # A group may have no rows at all for one of the fiscal years. Add the
    # missing columns up front so the table shows 0 instead of failing.
    missing_years = [y for y in required_years if y not in pivot.columns]
    if missing_years:
        pivot = pivot.reindex(columns=sorted([*pivot.columns, *missing_years]))

    t = sort_month(pivot.reset_index()).set_index("month")
    # Column labels may be Python ints or numpy integers depending on how
    # pandas built the index; accept both.
    t = t.rename(
        columns=lambda c: f"FY{str(c)[-2:]}" if isinstance(c, (int, np.integer)) else c
    )
    for fy in ("FY23", "FY24", "FY25"):
        t[fy] = t[fy].fillna(0).astype(int)

    # Plain comprehensions (instead of a row-wise apply) also work on an
    # empty table.
    t["YoY"] = [
        "-" if prev == 0 else f"{(cur / prev - 1):.0%}"
        for prev, cur in zip(t["FY24"], t["FY25"])
    ]
    # pct_change yields NaN for the first row / 0 -> 0 and +-inf after a zero
    # month; none of those is a meaningful percentage.
    t["MoM"] = [
        f"{chg:.0%}" if np.isfinite(chg) else "-"
        for chg in t["FY25"].pct_change()
    ]
    return t

def save_bar_chart(df_sub: pd.DataFrame, pretty_metric: str, label: str) -> str:
    """Render the FY23/FY24/FY25 grouped bar chart for one metric as a PNG.

    Args:
        df_sub: Rows of one URL group (input of ``monthly_table``).
        pretty_metric: Metric column to plot; also used as y-label, in the
            title and in the file name.
        label: Group label shown in the title, used to look up an optional
            entry in ``ANNOTATIONS`` and appended to the file name.

    Returns:
        File name of the saved PNG (written to the current directory).
    """
    fy_cols = ["FY23", "FY24", "FY25"]
    tbl = monthly_table(df_sub, pretty_metric)
    idx = np.arange(len(tbl))
    width = 0.25

    # A path separator in the metric or label would make savefig() point into
    # a directory that does not exist, so neutralise it in the file name only.
    stem = f"{pretty_metric.replace(' ','_')}_{label}"
    fname = stem.replace("/", "_").replace("\\", "_") + ".png"

    fig, ax = plt.subplots(figsize=(6, 6))
    try:
        # Three bars per month, centred on the tick: offsets -1, 0, +1 widths.
        for i, fy in enumerate(fy_cols):
            ax.bar(idx + (i - 1) * width, tbl[fy], width=width,
                   color=BAR_COLORS[fy], label=fy)

        # Optional annotation: dashed line + rotated text on the bar of the
        # annotated fiscal year (falls back to the tick centre otherwise).
        ann = ANNOTATIONS.get(label)
        if ann and ann["month"] in tbl.index:
            fy_line = ann["year"] + 1 if ann["month"] >= 9 else ann["year"]
            col = f"FY{str(fy_line)[-2:]}"
            offset = fy_cols.index(col) - 1 if col in fy_cols else 0
            xpos = idx[tbl.index == ann["month"]][0] + offset * width
            ax.axvline(x=xpos, color="blue", ls="--")
            ax.text(xpos, tbl[fy_cols].max().max() * 0.9, ann["text"], rotation=90)

        ax.set_xticks(idx)
        ax.set_xticklabels(tbl.index)
        ax.set_ylabel(pretty_metric)
        ax.set_title(f"{pretty_metric} – {label}")
        ax.legend()
        fig.tight_layout()
        fig.savefig(fname)
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)
    return fname


# -----------------------------------------------------------
# 6) ReportLab helpers
# -----------------------------------------------------------
# Shared ReportLab sample stylesheet; the PDF helpers look up the "Title",
# "Heading2", "Heading3" and "BodyText" styles in it.
styles = getSampleStyleSheet()

def rl_table(df_tbl: pd.DataFrame) -> Table:
    """Convert a DataFrame into a styled ReportLab ``Table``.

    The index is turned into a regular column, the column names become the
    header row, and ``int``/``float`` cells are rendered with thousands
    separators. All other cells are passed through unchanged.
    """
    def _cell(value):
        if isinstance(value, (int, float)):
            return f"{value:,}"
        return value

    flat = df_tbl.reset_index()
    header = list(flat.columns)
    body = [[_cell(item) for item in record] for record in flat.values]

    table = Table([header] + body)
    whole = ((0, 0), (-1, -1))
    head = ((0, 0), (-1, 0))
    table.setStyle(TableStyle([
        ("BACKGROUND", *head, colors.HexColor("#517D99")),
        ("TEXTCOLOR", *head, colors.white),
        ("ALIGN", *whole, "CENTER"),
        ("FONTSIZE", *whole, 8),
        ("GRID", *whole, 0.5, colors.black),
    ]))
    return table

# -----------------------------------------------------------
# 7-a) Helper – comparison tables  (fixed)
# -----------------------------------------------------------
def add_comparison_section(elements,
                           df_page1: pd.DataFrame,
                           df_page2: pd.DataFrame):
    """Append the FY24 vs FY25 comparison section to ``elements``.

    One table is added per group (``df_page1`` -> ``GROUPA_NAME``,
    ``df_page2`` -> ``GROUPB_NAME``). Only months inside the window
    ``COMP_START_MONTH``..``COMP_END_MONTH`` are considered; a start month
    greater than the end month means the window wraps over the year end.

    Each metric in ``COMPARISON_AGGS`` is aggregated with the pandas
    aggregation named there (e.g. ``"sum"``, ``"mean"``). Values are shown
    truncated to integers; ``"N/A"`` is shown when a value or the YoY ratio
    is undefined (no rows in the window, or FY24 equal to 0).

    Args:
        elements: List of ReportLab flowables, extended in place.
        df_page1: Rows of the first group (needs ``month``, ``fiscal_year``
            and the metric columns).
        df_page2: Rows of the second group.
    """
    wraps_year_end = COMP_START_MONTH > COMP_END_MONTH

    # For heading text only
    if wraps_year_end:
        period_text = f"{COMP_START_MONTH}–12 & 1–{COMP_END_MONTH}"
    else:
        period_text = f"{COMP_START_MONTH}–{COMP_END_MONTH}"

    def _aggregate(frame: pd.DataFrame, column: str, agg: str):
        """Aggregate one column; None when the result is undefined (NaN)."""
        value = frame[column].agg(agg)
        return None if pd.isna(value) else value

    def _fmt(value) -> str:
        """Thousands-separated integer text, or N/A for an undefined value."""
        return "N/A" if value is None else f"{int(value):,}"

    def _build_table(df_sub: pd.DataFrame) -> Table:
        # The mask must be built from *this* frame so the index aligns.
        if wraps_year_end:
            mask = (df_sub["month"] >= COMP_START_MONTH) | (df_sub["month"] <= COMP_END_MONTH)
        else:
            mask = (df_sub["month"] >= COMP_START_MONTH) & (df_sub["month"] <= COMP_END_MONTH)

        fy24 = df_sub[(df_sub["fiscal_year"] == 2024) & mask]
        fy25 = df_sub[(df_sub["fiscal_year"] == 2025) & mask]

        rows = [["Metrics", "FY24", "FY25", "YoY"]]
        for m, agg in COMPARISON_AGGS.items():
            v24 = _aggregate(fy24, m, agg)
            v25 = _aggregate(fy25, m, agg)
            if v24 is None or v25 is None or v24 == 0:
                yoy = "N/A"
            else:
                yoy = f"{(v25 / v24 - 1):+.0%}"
            rows.append([m, _fmt(v24), _fmt(v25), yoy])

        tbl = Table(rows, colWidths=[150, 100, 100, 80])
        tbl.setStyle(TableStyle([
            ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#517D99")),
            ("TEXTCOLOR",  (0, 0), (-1, 0), colors.white),
            ("ALIGN",      (0, 0), (-1, -1), "CENTER"),
            ("FONTNAME",   (0, 0), (-1, 0), "Helvetica-Bold"),
            ("FONTSIZE",   (0, 0), (-1, 0), 9),
            ("BACKGROUND", (0, 1), (-1, -1), colors.whitesmoke),
            ("GRID",       (0, 0), (-1, -1), 0.5, colors.black),
        ]))
        return tbl

    # ---------- section header ----------
    elements.append(PageBreak())
    elements.append(Paragraph(f"3. Comparison Table (Months {period_text})",
                              styles["Heading2"]))
    elements.append(Spacer(1, 6))

    # ---------- first group ----------
    elements.append(Paragraph(f"3-1. Performance – {GROUPA_NAME}", styles["Heading3"]))
    elements.append(_build_table(df_page1))
    elements.append(Spacer(1, 18))

    # ---------- second group ----------
    elements.append(Paragraph(f"3-2. Performance – {GROUPB_NAME}", styles["Heading3"]))
    elements.append(_build_table(df_page2))
    elements.append(Spacer(1, 24))



# -----------------------------------------------------------
# 7) Build PDF
# -----------------------------------------------------------

def add_metric_section(elements, title: str, df_sub: pd.DataFrame, page_url: str):
    """Append one "performance trend" section to ``elements``.

    After the section heading, every label in ``METRIC_LABELS`` contributes
    a sub-heading, a side-by-side layout (monthly table on the left, bar
    chart image on the right) and a spacer.

    Args:
        elements: List of ReportLab flowables, extended in place.
        title: Section heading text.
        df_sub: Rows of the group to report on.
        page_url: Label handed to ``save_bar_chart`` (chart title and file
            name); ``build_pdf`` passes the group name here.
    """
    elements.append(Paragraph(title, styles["Heading2"]))

    for label in METRIC_LABELS:
        month_tbl = monthly_table(df_sub, label)
        chart_path = save_bar_chart(df_sub, label, page_url)

        sub_heading = Paragraph(label, styles["Heading3"])
        side_by_side = Table(
            [[rl_table(month_tbl), RLImage(chart_path, width=260, height=260)]],
            colWidths=[240,310],
        )
        elements.extend([sub_heading, side_by_side, Spacer(1,30)])

def build_pdf(df: pd.DataFrame):
    """Assemble and write the landing-page comparison PDF.

    The document is written to the current directory as
    ``ga_lp_group_comparison_YYYYMMDD.pdf`` and consists of: cover page,
    one trend section per URL group, the FY24/FY25 comparison tables, a
    notes page with the metric explanations, and the URL list of each group.

    Args:
        df: Output of ``load_ga4_data``; must contain a ``landingPage``
            column, which is matched exactly against ``GROUPA_URLS`` /
            ``GROUPB_URLS``.
    """
    # Single timestamp so the file name and the cover cannot disagree when
    # the run crosses midnight.
    now = datetime.now()
    pdf_filename = f"ga_lp_group_comparison_{now:%Y%m%d}.pdf"
    doc = SimpleDocTemplate(pdf_filename, pagesize=A4,
                            leftMargin=30, rightMargin=30)
    els = []

    # ---------- Cover ----------
    # The reported range is the configured filter (DATA_START..DATA_END),
    # i.e. the range actually applied to the data, not a wall-clock guess.
    els += [
        Spacer(1, A4[1]/2 - 130),
        Paragraph("Organic Search Landing Page Report", styles["Title"]),
        Spacer(1, 12),
        Paragraph(f"Comparing {GROUPA_NAME} vs {GROUPB_NAME}", styles["Title"]),
        Spacer(1, 12),
        Paragraph(now.strftime("%Y-%m"), styles["Title"]),
        Spacer(1, 170),
        Paragraph("Created by: Shohei", styles["BodyText"]),
        Paragraph("Website: heysho.com", styles["BodyText"]),
        Paragraph("Filter: Organic Search Sessions", styles["BodyText"]),
        Paragraph("Data Source: Google Analytics 4", styles["BodyText"]),
        Paragraph(f"Data Range: {DATA_START} – {DATA_END}", styles["BodyText"]),
        PageBreak(),
    ]

    # ---------- Split the data into the two URL groups ----------
    df_groupA = df[df["landingPage"].isin(GROUPA_URLS)]
    df_groupB = df[df["landingPage"].isin(GROUPB_URLS)]
    for group_name, group_df in ((GROUPA_NAME, df_groupA), (GROUPB_NAME, df_groupB)):
        if group_df.empty:
            # Usually a sign that the configured URLs do not match the
            # landingPage values exactly.
            logging.warning("No rows matched the landing pages of %s", group_name)

    # ---------- Performance trend pages ----------
    add_metric_section(
        els,
        f"1. Performance Trend – {GROUPA_NAME}",
        df_groupA,
        GROUPA_NAME
    )

    els.append(PageBreak())
    add_metric_section(
        els,
        f"2. Performance Trend – {GROUPB_NAME}",
        df_groupB,
        GROUPB_NAME
    )

    # ---------- Comparison tables ----------
    add_comparison_section(els, df_groupA, df_groupB)

    # ---------- Notes page ----------
    els.append(PageBreak())
    els += [
        Paragraph("4. Notes", styles["Heading2"]),
        Spacer(1, 6),
        Paragraph(
            "This document contains confidential information intended solely for the designated recipients.",
            styles['BodyText']),
        Paragraph(
            "Unauthorized use, disclosure, or copying of this document is strictly prohibited.",
            styles['BodyText']),
        Spacer(1, 30),
        Paragraph("5. Explanation of Metrics", styles["Heading2"]),
        Spacer(1, 6),
    ]

    for k, v in metric_notes.items():
        els.append(Paragraph(f"<bullet>&bull;</bullet> <b>{k}</b>: {v}", styles["BodyText"]))
        els.append(Spacer(1, 5))

    # ---------- List of URLs by group ----------
    els.append(Spacer(1,30))
    small_list_style = ParagraphStyle(
        name='SmallList', parent=styles['BodyText'], fontSize=8, leading=10,
        spaceBefore=0, spaceAfter=0, leftIndent=16
    )
    els.append(Paragraph("6. List of URLs by Group", styles['Heading2']))
    els.append(Spacer(1,3))
    url_sections = (
        (f"6-1. {GROUPA_NAME}", GROUPA_URLS),
        (f"6-2. {GROUPB_NAME}", GROUPB_URLS),
    )
    for heading, urls in url_sections:
        els.append(Paragraph(heading, styles['Heading3']))
        for url in urls:
            els.append(Paragraph(f"• {url}", small_list_style))
        els.append(Spacer(1,12))

    # ---------- Build ----------
    doc.build(els)
    logging.info("PDF generated → %s", pdf_filename)



In [31]:
# -----------------------------------------------------------
# 8) Run end-to-end
# -----------------------------------------------------------
try:
    df_all = load_ga4_data()
    build_pdf(df_all)
except Exception as err:
    # Deliberately best-effort: the cell does not re-raise. logging.exception
    # keeps the same message but also records the traceback, so the failing
    # step can be located.
    logging.exception("Process failed: %s", err)