In [6]:
!pip install japanize-matplotlib
!pip install reportlab
!pip install --upgrade reportlab



In [7]:
import os
import calendar
from datetime import datetime, timedelta

# -- Google Colab & Google Sheets --
from google.colab import auth, drive, userdata
import gspread
from google.auth import default

# -- Data Analysis & Visualization --
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import japanize_matplotlib

# -- PDF Generation --
from reportlab.lib.pagesizes import A4
from reportlab.platypus import (
    SimpleDocTemplate,
    Paragraph,
    Spacer,
    Table,
    TableStyle,
    Image,
    PageBreak
)
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.cidfonts import UnicodeCIDFont

# ------------------------------------------------------------------------------
# 1. Google Drive & Google Sheets Authentication
# ------------------------------------------------------------------------------
def initialize_colab_and_gsheets():
    """
    Prepare the Colab session for Google Sheets access.

    Steps, in order:
      1. Mount Google Drive at '/content/drive' (forcing a remount).
      2. Run the Colab user authentication flow.
      3. Fetch the default Google credentials.

    Returns:
        The client object produced by gspread.authorize for those credentials.
    """
    drive.mount('/content/drive', force_remount=True)
    auth.authenticate_user()

    # default() yields (credentials, project id); only the credentials are used.
    credentials, _project_id = default()
    client = gspread.authorize(credentials)
    return client

# Authenticated gspread client; load_dataframes reads this module-level name.
# NOTE: the name `gc` would shadow the standard-library `gc` module if that were imported.
gc = initialize_colab_and_gsheets()

# ------------------------------------------------------------------------------
# 2. Config: Titles & Filenames
# ------------------------------------------------------------------------------
# Maps each Google Sheet name to the human-readable page-group title
# used in graph titles and PDF section headings.
TITLES = {
    "1_df_heysho.com_gsc_daily_total": "All Pages",
    "2_df_heysho.com_gsc_daily_jp": "Japanese Language Pages",
    "3_df_heysho.com_gsc_daily_en": "English Language Pages",
}

# Drive folder holding the data, kept for reference (e.g. to discover file
# names dynamically). It is not referenced by the code visible in this notebook.
FOLDER_PATH = "/content/drive/My Drive/google-colab/digital-marketing-report-generator/gsc-page-group-report/data"

# The sheet names to load are simply the keys of TITLES, in declaration order.
FILENAMES = list(TITLES)


Mounted at /content/drive


In [8]:

# ------------------------------------------------------------------------------
# 3. Data Loading
# ------------------------------------------------------------------------------
def load_sheet_data(spreadsheet, sheet_name):
    """
    Load one worksheet into a pandas DataFrame.

    Args:
        spreadsheet: Object exposing ``worksheet(name)``; the returned
            worksheet must expose ``get_all_values()`` returning a list of rows.
        sheet_name: Name of the worksheet to read.

    Returns:
        A DataFrame whose column names come from the first row and whose
        records come from the remaining rows. A worksheet that yields no rows
        at all produces an empty DataFrame instead of raising IndexError, so
        callers can filter it out with ``DataFrame.empty``.
    """
    rows = spreadsheet.worksheet(sheet_name).get_all_values()

    # No rows means there is no header row to index; return an empty frame.
    if not rows:
        return pd.DataFrame()

    header, *records = rows
    return pd.DataFrame(records, columns=header)

def load_dataframes(filenames):
    """
    Build a {filename: DataFrame} mapping from Google Sheets.

    Every name in `filenames` is opened as a spreadsheet through the
    module-level gspread client `gc`; the worksheet carrying that same name
    is then read with load_sheet_data.
    """
    return {
        name: load_sheet_data(gc.open(name), name)
        for name in filenames
    }

# Raw data: one DataFrame per configured sheet name, keyed by that name.
df_hub = load_dataframes(FILENAMES)



In [11]:

# ------------------------------------------------------------------------------
# 4. Data Preprocessing
# ------------------------------------------------------------------------------
def df_preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of a raw daily DataFrame.

    The input frame is never modified, so the function can be re-run on the
    same raw data (e.g. when a notebook cell is executed again).

    Steps:
      1. Verify that a 'Date' column exists.
      2. Drop the 'CTR' and 'Position' columns when present.
      3. Convert every column except 'Date' to numeric, stripping thousands
         separators; values that cannot be parsed become NaN.
      4. Parse 'Date' as datetime using the '%Y-%m-%d' format.
      5. Add 'year', 'month', 'week' (ISO week) and 'fiscal_year' columns.

    Fiscal years run from 1 June to 31 May:
      * before 2023-06-01        -> 'FY23'
      * 2023-06-01 .. 2024-05-31 -> 'FY24'
      * 2024-06-01 .. 2025-05-31 -> 'FY25'
      * anything else            -> None
    NOTE: rows dated after 2025-05-31 get no fiscal year and are therefore
    dropped by any later groupby on 'fiscal_year'. Extending the mapping
    requires updating the consumers that hard-code 'FY24'/'FY25' as well.

    Args:
        df: Raw frame with a 'Date' column of 'YYYY-MM-DD' strings.

    Returns:
        A new, preprocessed DataFrame.

    Raises:
        KeyError: If the 'Date' column is missing.
    """
    if 'Date' not in df.columns:
        raise KeyError("The dataframe is missing the 'Date' column.")

    # Explicit copy: all assignments below must stay off the caller's frame.
    cleaned = df.drop(columns=['CTR', 'Position'], errors='ignore').copy()

    # Convert everything except the date column to numbers.
    for col in cleaned.columns:
        if col == 'Date':
            continue
        cleaned[col] = pd.to_numeric(
            cleaned[col].astype(str).str.replace(',', '', regex=False),
            errors='coerce'
        )

    cleaned['Date'] = pd.to_datetime(cleaned['Date'], format='%Y-%m-%d')

    # Calendar breakdown
    cleaned['year'] = cleaned['Date'].dt.year
    cleaned['month'] = cleaned['Date'].dt.month
    cleaned['week'] = cleaned['Date'].dt.isocalendar().week

    fy24_start, fy24_end = pd.Timestamp('2023-06-01'), pd.Timestamp('2024-05-31')
    fy25_start, fy25_end = pd.Timestamp('2024-06-01'), pd.Timestamp('2025-05-31')

    def _fiscal_year(day):
        # Every comparison is False for NaT, so missing dates fall through to None.
        if day < fy24_start:
            return 'FY23'
        if fy24_start <= day <= fy24_end:
            return 'FY24'
        if fy25_start <= day <= fy25_end:
            return 'FY25'
        return None

    cleaned['fiscal_year'] = cleaned['Date'].apply(_fiscal_year)
    return cleaned

def preprocess_all_dataframes(df_dict):
    """
    Run df_preprocess over a {key: DataFrame} mapping.

    Empty DataFrames are left out of the result entirely; every other entry
    keeps its key and maps to the value returned by df_preprocess.
    """
    return {
        name: df_preprocess(frame)
        for name, frame in df_dict.items()
        if not frame.empty
    }

# Preprocessed frames keyed by sheet name; empty frames are skipped by
# preprocess_all_dataframes and therefore have no entry here.
df_hub_processed = preprocess_all_dataframes(df_hub)

# ------------------------------------------------------------------------------
# 5. Aggregation & Sorting
# ------------------------------------------------------------------------------
def custom_month_sort(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return `df` sorted by its 'month' column in fiscal-year order (Jun to May).

    June sorts first and May last. Month values missing from the mapping get
    a NaN sort key and therefore end up at the bottom.

    The temporary 'sort_key' column exists only on an intermediate copy, so
    the caller's DataFrame is left unchanged.

    Args:
        df: Frame with a numeric 'month' column (1-12).

    Returns:
        A new DataFrame with the same columns as `df`, rows reordered.
    """
    fiscal_months = [6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5]
    month_order = {month: rank for rank, month in enumerate(fiscal_months, start=1)}

    return (
        df.assign(sort_key=df['month'].map(month_order))
        .sort_values(by='sort_key')
        .drop('sort_key', axis=1)
    )

def generate_monthly_table(metric: str, df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the month-by-fiscal-year summary table for one metric.

    The metric is summed per (fiscal_year, month), pivoted so each fiscal
    year becomes a column, and sorted with custom_month_sort.

    Args:
        metric: Name of the numeric column to aggregate (e.g. 'Clicks').
        df: Preprocessed frame with 'fiscal_year', 'month' and `metric` columns.

    Returns:
        A DataFrame indexed by 'month' with exactly these columns, in order:
          * 'FY24', 'FY25' - integer totals (0 when there is no data).
          * 'YoY' - FY25 vs FY24 for the same month as a rounded percentage
            string, or '-' when either value is 0.
          * 'MoM' - FY25 vs the previous row's FY25 value as a rounded
            percentage string. The first row is compared against the FY24
            value of the last row (the month preceding the fiscal year when
            all twelve months are present). '-' is shown when the change is
            undefined or when the month has no FY25 data (value 0), matching
            how 'YoY' treats 0, so months not yet reached do not show "-100%".
        When the aggregation yields no rows, an empty frame with the same
        columns is returned.
    """
    monthly_totals = df.groupby(['fiscal_year', 'month'])[metric].sum().reset_index()

    # Nothing to report (e.g. every row lacks a fiscal year): return an empty
    # table instead of failing on the index lookups further down.
    if monthly_totals.empty:
        return pd.DataFrame(
            columns=['FY24', 'FY25', 'YoY', 'MoM'],
            index=pd.Index([], name='month')
        )

    pivot = monthly_totals.pivot_table(
        index='month',
        columns='fiscal_year',
        values=metric,
        fill_value=0
    ).reset_index()

    pivot = custom_month_sort(pivot).set_index('month')

    # Make sure both reported fiscal years exist and hold integers.
    for fy in ('FY24', 'FY25'):
        if fy not in pivot.columns:
            pivot[fy] = 0
        pivot[fy] = pivot[fy].fillna(0).astype(int)

    # Keep only the reported years (this drops FY23) in a fixed order,
    # regardless of which years were present in the data.
    pivot = pivot[['FY24', 'FY25']].copy()

    def _format_pct(value):
        # NaN / inf mean "no meaningful change" and are rendered as '-'.
        return '-' if not np.isfinite(value) else f"{int(round(value))}%"

    # YoY: same month, FY25 vs FY24.
    pivot['YoY'] = [
        '-' if prev == 0 or cur == 0 else f"{int(round((cur / prev - 1) * 100))}%"
        for prev, cur in zip(pivot['FY24'], pivot['FY25'])
    ]

    # MoM within FY25 (row vs previous row).
    mom = pivot['FY25'].pct_change() * 100

    # First row: compare with the last row's FY24 value.
    first_month = pivot.index[0]
    last_month = pivot.index[-1]
    previous_value = pivot.loc[last_month, 'FY24']
    current_value = pivot.loc[first_month, 'FY25']
    if previous_value == 0:
        mom.loc[first_month] = np.nan
    else:
        mom.loc[first_month] = ((current_value - previous_value) / previous_value) * 100

    # A month without FY25 data has no meaningful MoM figure.
    mom = mom.where(pivot['FY25'] != 0)

    pivot['MoM'] = [_format_pct(value) for value in mom]
    return pivot

# ------------------------------------------------------------------------------
# 6. Visualization
# ------------------------------------------------------------------------------
def generate_monthly_graph(metric: str, df: pd.DataFrame, df_key: str):
    """
    Draw a grouped bar chart of FY24 vs. FY25 monthly totals for one metric.

    The metric is summed per (fiscal_year, month), pivoted into one column
    per fiscal year and ordered with custom_month_sort. The chart title is
    looked up in TITLES by `df_key` (falling back to the key itself).

    Returns:
        Path of the PNG written under the 'images' directory.
    """
    monthly_totals = (
        df.groupby(['fiscal_year', 'month'])[metric]
        .sum()
        .reset_index()
    )

    pivot = monthly_totals.pivot_table(
        index='month',
        columns='fiscal_year',
        values=metric,
        fill_value=0
    ).reset_index()
    pivot = custom_month_sort(pivot).set_index('month')

    # Both plotted fiscal years must exist and hold integers.
    for fy in ['FY24', 'FY25']:
        if fy not in pivot.columns:
            pivot[fy] = 0
        pivot[fy] = pd.to_numeric(pivot[fy], errors='coerce').fillna(0).astype(int)

    # Two bars per month, drawn side by side.
    fig, ax = plt.subplots(figsize=(6, 6))
    bar_width = 0.35
    positions = np.arange(len(pivot.index))

    ax.bar(positions, pivot['FY24'], width=bar_width, color='#A3C8F0', label='FY24')
    ax.bar(positions + bar_width, pivot['FY25'], width=bar_width, color='#1F4E79', label='FY25')

    # Plain numbers on the y-axis (no offset, no scientific notation).
    formatter = ScalarFormatter(useOffset=False)
    formatter.set_scientific(False)
    ax.yaxis.set_major_formatter(formatter)

    ax.set_xlabel('Month')
    ax.set_ylabel(metric)
    ax.set_title(f"{TITLES.get(df_key, df_key)} - {metric}")

    # Centre each tick between the two bars of its month.
    ax.set_xticks(positions + bar_width / 2)
    ax.set_xticklabels([int(month) for month in pivot.index])

    ax.legend(loc='upper right')
    ax.grid(True, color='#999', linewidth=0.2, linestyle='--', axis='y')
    fig.tight_layout()

    # Write the PNG and release the figure.
    image_dir = 'images'
    os.makedirs(image_dir, exist_ok=True)
    file_stem = df_key.replace(" ", "_")
    image_path = os.path.join(image_dir, f"{file_stem}_monthly_{metric.lower()}.png")
    fig.savefig(image_path)
    plt.close(fig)
    return image_path

# ------------------------------------------------------------------------------
# 7. Generate Graphs & Tables for each DataFrame
# ------------------------------------------------------------------------------
metrics = ['Clicks', 'Impressions']
datasets = []  # (monthly_table_df, image_path, title) tuples read by generate_pdf

# Sanity check: warn when the processed frames and TITLES differ in count
df_keys_processed = list(df_hub_processed.keys())
if len(df_keys_processed) != len(TITLES):
    print("Warning: The number of processed DataFrames does not match the number of titles.")
    print("Ensure each DataFrame has a corresponding entry in TITLES if needed.")

# One graph + one table per (page group, metric) pair.
# Sections are numbered by position: "1-a" / "1-b" for the first frame, and so on.
for title_number, (df_key, df) in enumerate(df_hub_processed.items(), start=1):
    if df.empty:
        continue

    base_title = TITLES.get(df_key, df_key)

    for metric in metrics:
        # 'a' marks the Clicks section, 'b' any other metric
        title_suffix = 'a' if metric == 'Clicks' else 'b'
        combined_title = f"{title_number}-{title_suffix}. {base_title} - {metric}"

        # Graph first, then the table, then record both with the title
        graph_path = generate_monthly_graph(metric, df, df_key)
        monthly_table = generate_monthly_table(metric, df)
        datasets.append((monthly_table, graph_path, combined_title))

# ------------------------------------------------------------------------------
# 8. PDF Creation
# ------------------------------------------------------------------------------
def create_table(data, image_path, styles):
    """
    Turn a 2D list into a styled ReportLab Table and load its companion image.

    int/float cells (bools excluded) are rendered as comma-grouped integers;
    every other cell is passed through unchanged. The first row is styled as
    the header and the first column below it gets a light background.

    Returns:
        (table, image). If building the Image raises, `image` is a Paragraph
        (in styles['BodyText']) carrying the error text instead.
    """
    def _format_cell(value):
        # bool is a subclass of int, so it has to be excluded explicitly.
        is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
        return f"{int(value):,}" if is_number else value

    formatted_rows = [[_format_cell(cell) for cell in row] for row in data]

    month_col_index = 0  # Assumes the first column is 'month'

    table = Table(formatted_rows)
    table.setStyle(TableStyle([
        # Header row
        ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#517D99')),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
        # All cells
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('LEFTPADDING', (0, 0), (-1, -1), 6),
        ('RIGHTPADDING', (0, 0), (-1, -1), 6),
        ('TOPPADDING', (0, 0), (-1, -1), 4),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
        # Month column highlight (body rows only), applied last
        ('BACKGROUND', (month_col_index, 1), (month_col_index, -1), colors.whitesmoke),
    ]))

    # Fall back to a text placeholder when the image cannot be loaded.
    try:
        image = Image(image_path)
        image.drawHeight = 260
        image.drawWidth = 260
    except Exception as e:
        image = Paragraph(f"Image not found: {e}", styles['BodyText'])

    return table, image

def generate_pdf():
    """
    Generate the PDF report from the module-level `datasets` list.

    The document contains, in order:
      1. A cover page (title, reported month, metadata).
      2. A table of contents whose entries link to the sections.
      3. One section per dataset: summary table and graph side by side,
         two sections per page.
      4. Closing notes (license, metric and page-group explanations).

    The month shown on the cover is the calendar month before the current
    one. It is derived from the last day of the previous month, not from
    "today minus 30 days", which gives the wrong month near month boundaries
    (e.g. on the 31st, or on 1 March).

    The file is written to the current working directory as
    'gsc-page-group-report-monthly-simple-<YYYY-MM-DD>.pdf'.
    """
    # Take the clock once so every date on the report is consistent.
    now = datetime.now()
    current_date = now.strftime('%Y-%m-%d')

    pdf_path = f"gsc-page-group-report-monthly-simple-{current_date}.pdf"
    doc = SimpleDocTemplate(pdf_path, pagesize=A4)
    styles = getSampleStyleSheet()
    elements = []

    # -- Cover Page --
    # First day of the current month minus one day = last day of the previous month.
    first_day_this_month = datetime(now.year, now.month, 1)
    last_day_prev_month = first_day_this_month - timedelta(days=1)

    cover_title = Paragraph('Google Search Console ', styles['Title'])
    cover_title_2 = Paragraph('Monthly Page Group Report', styles['Title'])
    cover_subtitle = Paragraph(f'{last_day_prev_month.strftime("%B %Y")}', styles['Title'])

    elements.append(Spacer(1, A4[1] / 2 - 160))
    elements.append(cover_title)
    elements.append(cover_title_2)

    elements.append(Spacer(1, 12))
    elements.append(cover_subtitle)
    elements.append(Spacer(1, 200))

    # Metadata on cover
    three_days_ago = (now - timedelta(days=3)).strftime('%Y-%m-%d')
    elements.append(Paragraph(f'Created by:  Shohei on {current_date}', styles['BodyText']))
    elements.append(Paragraph('Website: heysho.com', styles['BodyText']))
    elements.append(Paragraph('Data Source: Google Search Console', styles['BodyText']))
    elements.append(Paragraph(f'Data Range: 2023-09-01 - {three_days_ago}', styles['BodyText']))
    elements.append(PageBreak())

    # -- Table of Contents --
    elements.append(Paragraph('Table of Contents', styles['Heading1']))
    elements.append(Spacer(1, 12))

    # Each entry links to the anchor emitted with the matching section heading.
    for idx, (_, _, title) in enumerate(datasets):
        anchor_name = f'section_{idx}'
        link_text = f'<link href="#{anchor_name}">{title}</link>'
        elements.append(Paragraph(link_text, styles['BodyText']))

    elements.append(Spacer(1, 30))
    elements.append(Paragraph("**Click to jump to the page**", styles['Normal']))
    elements.append(PageBreak())

    # -- Dataset Pages --
    for idx, (table_df, img_path, section_title) in enumerate(datasets):
        # Header row + data rows, with the 'month' index turned into a column
        table_df = table_df.reset_index()
        table_data = [table_df.columns.to_list()] + table_df.values.tolist()

        t, img = create_table(table_data, img_path, styles)

        # Anchor targeted by the TOC link
        anchor_name = f'section_{idx}'
        elements.append(Paragraph(f'<a name="{anchor_name}"/>{section_title}', styles['Heading2']))
        elements.append(Spacer(1, 6))

        # Layout: table & image side by side
        col_layout = Table([[t, img]], colWidths=[230, 300], style=[
            ('ALIGN', (0, 0), (0, 0), 'RIGHT'),
            ('ALIGN', (1, 0), (1, 0), 'LEFT')
        ])
        elements.append(col_layout)
        elements.append(Spacer(1, 20))

        # Every two sections, insert a page break
        if (idx + 1) % 2 == 0:
            elements.append(PageBreak())

    # -- Final Notes --
    def _append_bullets(entries):
        # One bulleted "<b>title</b>: description" paragraph per entry.
        for entry in entries:
            text = f"<bullet>&bull;</bullet> <b>{entry['title']}</b>: {entry['description']}"
            elements.append(Paragraph(text, styles['BodyText']))
            elements.append(Spacer(1, 6))

    elements.append(Paragraph('Usage Rights and License', styles['Heading2']))
    elements.append(Spacer(1, 6))
    elements.append(Paragraph(
        'The use of this template is restricted to personal purposes only. Any commercial use or provision to third parties is strictly prohibited. Redistribution of the template, as well as the redistribution of any modified version or derivative works that incorporate modifications, is prohibited in all forms. The sale, transfer, or public use (including online sharing) of any part or the entirety of the template is also prohibited.',
        styles['BodyText']
    ))

    elements.append(Spacer(1, 14))
    elements.append(Paragraph('Explanation of Metrics', styles['Heading2']))
    elements.append(Spacer(1, 6))

    metrics_explanations = [
        {
            "title": "clicks",
            "description": "The number of times users clicked on your website’s URL from Google Search results."
        },
        {
            "title": "Impressions",
            "description": "The number of times your website’s URL appeared in Google Search results."
        },
        {
            "title": "MoM (Month-over-Month)",
            "description": "Percentage change from one month to the previous month."
        },
        {
            "title": "YoY (Year-over-Year)",
            "description": "Percentage change compared to the same month in the previous fiscal year."
        }
    ]
    _append_bullets(metrics_explanations)

    elements.append(Spacer(1, 12))
    elements.append(Paragraph('Explanation of Page Groups', styles['Heading2']))
    elements.append(Spacer(1, 6))

    pagegroup_explanations = [
        {
            "title": "All Pages",
            "description": "Aggregated performance data for every page on the website, offering a comprehensive view of overall traffic."
        },
        {
            "title": "Japanese Language Pages",
            "description": "Traffic data for pages primarily intended for Japanese audiences, identified by URLs that do not contain '/en/'."
        },
        {
            "title": "English Language Pages",
            "description": "Traffic data for pages targeted toward English-speaking audiences, recognized by the inclusion of '/en/' in the URL."
        }
    ]
    _append_bullets(pagegroup_explanations)

    elements.append(PageBreak())

    # -- Build PDF --
    doc.build(elements)
    print(f"PDF '{pdf_path}' generated successfully.")


In [12]:
# ------------------------------------------------------------------------------
# 9. Run the PDF Generation
# ------------------------------------------------------------------------------
# Builds the report from the module-level `datasets` list and writes the PDF
# (a relative file name) into the current working directory.
generate_pdf()

PDF 'gsc-page-group-report-monthly-simple-2025-04-15.pdf' generated successfully.
