# Using Gemini to validate documentation topics PLAYWRIGHT VERSION

Copyright 2025 Google LLC.
SPDX-License-Identifier: Apache-2.0

This notebook iterates through a list of documentation topics and uses Gemini to check the semantic validity of links in the documentation topics. For each link in a topic, the notebook uses [Playwright](https://playwright.dev/) to navigate to the destination of the link, takes a screenshot of the destination, and then uses Gemini to evaluate whether the destination of the link matches the link text at the source and whether there is an error at the destination.

Specifically, this notebook:

* Ingests the [CSV containing URLs for doc topics](https://raw.githubusercontent.com/markbpryan/using_gemini_to_validate_doc_topics/main/input_urls.csv) into a dataframe
* Launches Playwright
* Launches Gemini 2.5 Gen AI client using [Google GenerativeAI APIs](https://ai.google.dev/api/python/google/generativeai)
* Iterates through the doc URLs. For each doc URL:
    * Get the HTML for doc topic at the URL
    * Iterate through the list of links for the URL and for each link in the HTML, get the `href` value, and the link text.
    * For each link, use Playwright to navigate to the destination of the link and save a screencap of the destination.
    * Use Gemini to compare the link text with the screencap for that link's destination to assess if the link text matches the text in the link destination and whether there is some kind of error at the destination.

* The results of this validation are displayed in a table where you can click to get further details.



![Overview](https://raw.githubusercontent.com/markbpryan/using_gemini_to_check_links/refs/heads/main/notebook_diagram.png)
  




<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/markbpryan/using_gemini_to_check_links/blob/main/using_gemini_to_check_links_pw.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/markbpryan/using_gemini_to_check_links/blob/main/using_gemini_to_check_links.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Set up libraries and imports

In [None]:
# Install Playwright (via pytest-playwright) and run its browser installer;
# the WebAgent class below launches Chromium through Playwright.
!pip install pytest-playwright
!playwright install

In [None]:
from enum import Enum
import getpass
import pathlib
import tempfile
import time
import json
from urllib.parse import urlparse
import pandas as pd
import posixpath
import os
import logging
import yaml
import re
import requests
import logging


import base64
import ipywidgets as widgets


import bs4
from IPython import display
import PIL.Image
from PIL import Image as PImage
import asyncio
import playwright
from playwright import async_api, sync_api
from playwright.sync_api import sync_playwright
from playwright.async_api import async_playwright

import google.generativeai as genai
from google.generativeai.types import safety_types
from google.colab import files








# Set parameters

Set values of variables used to control operation of the notebook.



In [None]:



# switch for the except blocks in this notebook: False lets them fall back to
# placeholder values and continue; True re-raises so the exception ends execution
show_exceptions = True
# start time for execution
initial_timestamp = time.time()
# flag for displaying screencaps
# NOTE(review): not read by any of the visible cells
display_screencaps = False
# text to search for in screencaps that indicates broken link to console
# NOTE(review): only message_text is interpolated into the model prompt;
# message_text_list is not referenced by any of the visible cells
message_text_list = ["error message", "model is not found"]
message_text = "Error message"
# expected prefix for console links; links that do not start with it are skipped
link_prefix = "https://cloud.google.com"
# number of pixels cropped from the top of each screen cap to avoid including boilerplate
vertical_offset = 50
# dimensions for screen caps (browser viewport size)
window_width = 1380  # @param {type: "number"} was 1080
window_height = 720  # @param {type: "number"}
# space out prediction calls (also used as the pause before taking a screenshot)
predictions_space_out = 1 # seconds
page_load_timeout = 60 # originally 60
# placeholder record for topics where collection of console link details fails
placeholder_dict_list = [{'console_link': 'https://console.cloud.google.com/vertex-ai/model-garden', 'console_link_text': 'PLACEHOLDER',  'image_title': 'PLACEHOLDER', 'title_match': 'YES'}]
placeholder_console_link = 'https://console.cloud.google.com/vertex-ai/model-garden'
placeholder_console_link_text = 'PLACEHOLDER'
placeholder_image_title = 'PLACEHOLDER'
placeholder_title_match = 'YES'
# string form of the placeholder record, returned by remove_prefix when the model
# output contains no "{" (the trailing "]" is part of the value)
placeholder_string = '{"console_link": "https://console.cloud.google.com/vertex-ai/model-garden", "console_link_text": "PLACEHOLDER",  "image_title": "PLACEHOLDER", "title_match": "YES"}]'
placeholder_dict = {"console_link": "https://console.cloud.google.com/vertex-ai/model-garden", "console_link_text": "PLACEHOLDER", "image_title": "PLACEHOLDER", "title_match": "YES"}
# stand-in topic HTML used by get_link_details when a topic cannot be fetched
placeholder_html = """<article class="devsite-article">
<div background="google-blue" class="devsite-banner devsite-banner-announcement nocontent">
<div class="devsite-banner-message">
<div class="devsite-banner-message-text">
<cloudx-free-trial-eligible-content>
<li>In the Google Cloud console, go to the <b>Vertex AI Studio</b> page.
    <p><a class="button button-primary" href="https://console.cloud.google.com/vertex-ai/generative/language" target="console" track-name="consoleLink" track-type="task">Go to
    PLACEHOLDER</a></p></li>
</article>
"""
# substring that marks a working directory as an already-created sandbox
sandbox_string = "sandbox"
# NOTE(review): ask_api_key is not read by any of the visible cells
ask_api_key = True
# CSV listing the doc topic URLs to validate
test_dataset_url = "https://raw.githubusercontent.com/markbpryan/using_gemini_to_check_links/refs/heads/main/input_urls.csv"

In [None]:
# Configure file logging for this run. The log file name is derived from the
# current time, so each execution of this cell targets a new log file.
timestamp = time.time()
log_filename = str(timestamp)+'.log'
logger = logging.getLogger(__name__)
# force=True replaces any handlers already attached to the root logger. Without
# it, basicConfig does nothing when the root logger already has handlers (for
# example when this cell is re-run), so messages would never reach log_filename.
logging.basicConfig(filename=log_filename, encoding='utf-8', level=logging.DEBUG, force=True)

# Ingest CSV file

Read CSV file containing URLs whose links will be validated into a Pandas dataframe

In [None]:


# Load the list of doc topic URLs to validate.
df = pd.read_csv(test_dataset_url)
# Add an empty "result" column; main() fills it in for each topic.
df = df.assign(result='')
print(df.head(5))

In [None]:
# Preview the first rows of the input dataframe.
df.head()

# Set up sandbox directory

Set up local directory to save screenshots.


In [None]:
def setup_sandbox_directory():
  """Return the directory used to store screenshots, creating it if needed.

  If the current working directory path already contains ``sandbox_string`` it
  is returned unchanged. Otherwise a ``<sandbox_string>_<timestamp>`` directory
  under the current working directory is created (when missing) and returned.
  """
  current_dir = os.getcwd()
  # already inside a sandbox directory: reuse it
  if sandbox_string in current_dir:
    return current_dir
  sandbox_dir = os.path.join(current_dir, f"{sandbox_string}_{timestamp}")
  if not os.path.exists(sandbox_dir):
    os.makedirs(sandbox_dir)
  return sandbox_dir


In [None]:
# Create (or reuse) the directory that will hold the screenshots.
sandbox_path = setup_sandbox_directory()
# logging applies printf-style formatting to its arguments, so the value needs
# a %s placeholder; without one the record fails to format and is not written.
logger.debug("sandbox_path: %s", sandbox_path)

## Launch Playwright

Set parameters and start [Playwright](https://playwright.dev/) client. We will use Playwright to navigate to the destinations of links in the input topics so we can get screenshots of the link destinations.

In [None]:
class Agent:
    """Small async wrapper around a Playwright page."""

    def __init__(self, page: async_api.Page):
        self.page = page

    async def goto(self, url: str) -> None:
        """Navigate the wrapped page to ``url``."""
        await self.page.goto(url)

    async def get_title(self) -> str:
        """Return the title reported by the wrapped page."""
        title = await self.page.title()
        return title

# WebAgent Definition

Used to capture screenshots for link destinations.


In [None]:

class WebAgent:
    """Captures a cropped screenshot of a link destination using Playwright.

    After a successful ``await agent.get(url)``:
      * ``filename`` is the path of the full-size PNG written under ``out_dir``
      * ``image1`` is the cropped RGB PIL image of that screenshot

    Both attributes are ``None`` before the first capture and are reset at the
    start of every ``get`` call, so a failed capture can never be mistaken for
    the screenshot of a previously visited link.
    """

    def __init__(
      self,
      out_dir: pathlib.Path,
      window_height: int,
      window_width: int,
    ):
        self.out_dir = out_dir
        self.action_idx = 0
        self.window_height = window_height
        self.window_width = window_width
        # Populated by get(); defined here so the attributes always exist.
        self.filename = None
        self.image1 = None

    def _set_screenshot(self) -> None:
        """Pick a fresh, timestamp-based PNG path under ``out_dir``."""
        timestamp_ms = int(time.time() * 1000)
        screenshot_filename = f'{timestamp_ms}_screenshot.png'
        self.filename = os.path.join(self.out_dir, screenshot_filename)

    async def get(self, url: str) -> None:
        """Open ``url`` in headless Chromium and capture a screenshot of it.

        Args:
          url: fully-qualified address of the page to capture.

        Raises:
          Exception: any error raised while navigating, capturing, or cropping
            is logged and re-raised so that the caller's own error handling
            (controlled by ``show_exceptions``) decides what happens next.
        """
        # Drop the previous capture so stale results cannot leak into this one.
        self.filename = None
        self.image1 = None
        async with async_api.async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(
                    viewport={'width': self.window_width, 'height': self.window_height}
                )
                page = await context.new_page()
                await page.goto(url)
                # Give the page time to finish rendering. asyncio.sleep yields to
                # the event loop instead of blocking it the way time.sleep does;
                # the factor 2 keeps the total pause of the two former sleeps.
                await asyncio.sleep(2 * predictions_space_out)
                self._set_screenshot()
                await page.screenshot(path=self.filename)
                # crop the screencaps to omit boilerplate
                cropping_box = (0, vertical_offset, self.window_width, self.window_height)
                # The context manager closes the file handle once the cropped
                # copy has been produced.
                with PImage.open(self.filename) as full_image:
                    self.image1 = full_image.crop(cropping_box).convert('RGB')
            except Exception as e:
                self.filename = None
                self.image1 = None
                logger.error("screenshot failed for %s: %s", url, e)
                raise
            finally:
                await browser.close()

# Initialize the shared screenshot agent used for every link destination.
agent = WebAgent(out_dir=sandbox_path,window_height=window_height, window_width=window_width)

# print() instead of display.display(): a later cell rebinds the name `display`
# to the IPython display function, after which `display.display` no longer
# exists and re-running this cell would fail.
print('Agent is now initialized.')

## Initialize GenAI client

This notebook uses the [Google GenerativeAI APIs](https://ai.google.dev/api/python/google/generativeai).

You can create an API key here: https://aistudio.google.com/app/apikey.

For this demo, we will use Gemini 2.5 Flash.

In [None]:
# Generation settings and model selection (shown as Colab form fields)
temperature = 0.2  # @param {type:"slider", min:0, max:1, step:0.1}
response_mime_type = 'application/json'
max_output_tokens = 8192  # @param {type: "number"}
model_name = 'gemini-2.5-flash'  # @param {type: "string"}
api_key = ''  # @param {type: "string"}

# Keep prompting until a non-empty API key has been supplied.
while not api_key:
  api_key = getpass.getpass('Enter your API key:')



class GenAiClient:
  """Wrapper around the google.generativeai SDK used by this notebook.

  Holds a generative model, a chat session started with empty history, and a
  generation config built from the notebook-level ``temperature``,
  ``response_mime_type`` and ``max_output_tokens`` settings.
  """

  def __init__(self, model_name: str, api_key: str):
    """Configure the SDK with ``api_key`` and create the model and chat session."""
    genai.configure(api_key=api_key)
    self.generation_config = genai.types.GenerationConfig(
        temperature=temperature,
        response_mime_type=response_mime_type,
        max_output_tokens=max_output_tokens,
    )

    self.model = genai.GenerativeModel(
        model_name=model_name,
    )
    self.chat = self.model.start_chat(history=[])



  def predict_visual(self, prompt: str, img: PIL.Image.Image) -> str:
    """Send ``prompt`` together with ``img`` to the model and return the response text.

    Note: ``self.generation_config`` is not passed on this call.
    """
    response = self.model.generate_content([prompt, img])
    return response.text



  def predict(self, prompt: str) -> str:
    """Send ``prompt`` on the chat session using ``self.generation_config`` and return the response text."""
    response = self.chat.send_message(
        content=prompt, generation_config=self.generation_config
    )
    return response.text


# Create the client and send a trivial prompt to confirm that the API key and
# model name work before starting the analysis.
try:
  genai_client = GenAiClient(model_name=model_name, api_key=api_key)
  genai_client.predict('Hi Gemini!!! :D')
except Exception as e:
  if show_exceptions:
    raise
  # %s placeholder: logging formats its arguments printf-style, so without it
  # the record fails to format and the error is never written to the log.
  logger.debug("ERROR is %s", e)


# Getting Content from a URL

To form a prompt for Gemini that includes the key text from the topic, we need to get the required text from the topic. This function takes a doc topic URL as input and returns an object containing the HTML of the rendered topic.

In [None]:
# function to get HTML content of a topic
def get_html_content(input_url: str) -> bs4.Tag:
  """Fetch a doc topic and return its main ``devsite-article`` element.

  Args:
    input_url: topic URL without the scheme; ``https://`` is prepended.

  Returns:
    The first element of the page whose class is ``devsite-article``.

  Raises:
    requests.RequestException: on network failure, timeout, or an HTTP error
      status.
    IndexError: if the page contains no ``devsite-article`` element.
  """
  target_url = 'https://'+input_url
  # Without a timeout, requests can wait indefinitely on an unresponsive server.
  response = requests.get(target_url, timeout=page_load_timeout)
  # Fail on 4xx/5xx responses instead of parsing an error page as a topic.
  response.raise_for_status()
  soup = bs4.BeautifulSoup(response.text, 'html.parser')
  devsite_article = soup.find(class_='devsite-article')
  if devsite_article is None:
    raise IndexError(f"no devsite-article element found at {target_url}")
  return devsite_article



## String Cleanup Functions

The model returns strings that need cleanup to prepare them to be converted into Python structures: dicts and lists. These functions perform the required cleanup.

In [None]:
# function to remove flotsam prior to the first "{" to facilitate conversion of string to Python dictionary
def remove_prefix(text, prefix):
  """Drop everything in ``text`` that precedes the first ``prefix``.

  Returns ``text`` from the first occurrence of ``prefix`` onwards, or
  ``placeholder_string`` when ``prefix`` does not occur in ``text`` at all.
  """
  start = text.find(prefix)
  if start == -1:
    return placeholder_string
  return text[start:]



In [None]:
# function to remove flotsam after the last "}" to facilitate conversion of string to Python dictionary
def remove_suffix(text, suffix):
  """Drop everything in ``text`` that follows the last ``suffix``.

  Args:
    text: string returned by the model.
    suffix: closing delimiter the result must end with (for example "}" or "]").

  Returns:
    * ``text`` unchanged when it already ends with ``suffix``;
    * ``text`` cut just after the last ``suffix`` when ``suffix`` occurs in it;
    * otherwise ``text`` trimmed back to its last "}" with ``suffix`` appended;
      when there is no "}" either, the missing delimiter is simply appended.
  """
  if text.endswith(suffix):
    return text
  if suffix in text:
    # len(suffix) rather than 1 so multi-character suffixes are kept whole
    return text[:text.rindex(suffix) + len(suffix)]
  if suffix == "}":
    # Nothing to trim back to. Recursing with "}" again, as the fallback below
    # does for other suffixes, would never terminate.
    return text + suffix
  return remove_suffix(text, "}") + suffix


In [None]:
# prep string returned by model to be run through json_load
def prep_json_load(input_text):
  """Turn a model response into a string shaped like a JSON list.

  Trims anything before the first "{" and after the last "]", removes escaped
  and literal newlines, collapses runs of four and then three spaces into one,
  and prepends "[".
  """
  trimmed = remove_suffix(remove_prefix(input_text, "{"), "]")
  # replacements are applied in this exact order
  for old, new in (("\\n", ""), ("\n", ""), ("    ", " "), ("   ", " ")):
    trimmed = trimmed.replace(old, new)
  return "[" + trimmed

In [None]:
def replace_null(text):
  """Return ``text`` with every ``null`` wrapped in double quotes.

  Nulls that are already quoted are unquoted first, so they do not end up with
  two pairs of quotes.
  """
  unquoted = 'null'.join(text.split('"null"'))
  return '"null"'.join(unquoted.split('null'))

# Get Link Details

For a given topic URL, get the HTML for the topic and then use Beautiful Soup to get the console link and console link text.

In [None]:
# get the link details from HTML using Beautiful Soup
def get_link_details(topic_URL):
  """Collect the links (href and link text) found in a doc topic.

  Args:
    topic_URL: topic URL without the scheme, as stored in the input dataframe.

  Returns:
    A list with one dict per anchor that has an ``href`` attribute, with keys
    ``console_link`` (the href value) and ``console_link_text`` (the link text).

  Raises:
    Exception: whatever ``get_html_content`` raised, when ``show_exceptions``
      is True. When it is False, ``placeholder_html`` is parsed instead.
  """
  # get the HTML for the doc topic
  logger.debug("BSLINK HTML about to ingest ")
  # exception handling around code that can cause issues in long runs
  try:
    topic_content = get_html_content(topic_URL)
  except Exception as e:
    if show_exceptions:
      raise
    topic_content = placeholder_html
    # Report the URL passed to this function; no `row` variable exists in
    # this scope.
    logger.debug("error with URL %s", topic_URL)
    logger.debug("ERROR is %s", e)

  soup = bs4.BeautifulSoup(str(topic_content), 'html.parser')

  dict_list = []
  # href=True skips anchors without an href attribute, which would otherwise
  # be recorded with a None link that cannot be cleaned up or visited.
  for link in soup.find_all('a', href=True):
      new_dict = {
          "console_link": link.get('href'),
          "console_link_text": link.text,
      }
      logger.debug("dict is %s", new_dict)
      dict_list.append(new_dict)

  return dict_list

# Model call function

`compare_link_with_screencap` function invokes the model to determine whether the link destinations contain text that matches the link text and whether there is an error at the destination.

In [None]:
# use model to compare link details with contents of screencap of destination
async def compare_link_with_screencap(dict,agent):
  """Ask the model whether a link's text matches a screenshot of its destination.

  Args:
    dict: link record with keys ``console_link`` (fully-qualified URL) and
      ``console_link_text``. The name shadows the builtin inside this function.
    agent: WebAgent used to capture the screenshot of the destination.

  Returns:
    The link record merged with the fields parsed from the model response
    (the prompt asks for ``image_title``, ``title_match`` and ``message_text``)
    plus ``screenshot``, the path of the captured image. When the model call or
    the parsing fails and ``show_exceptions`` is False, a copy of
    ``placeholder_dict`` is returned instead.

  Raises:
    Exception: any screenshot, model, or parsing error when ``show_exceptions``
      is True.
  """
  import ast  # stdlib; imported here because the import cell does not include it

  # get the screencap associated with the console link
  logger.debug("SCREENCAP CONSOLE LINK COMPARE dict is %s", dict)
  try:
    await agent.get(dict["console_link"])
  except Exception as e:
    if show_exceptions:
      raise
    # Report the link being analyzed; no `row` variable exists in this scope.
    print("error with URL ",dict["console_link"])
    print("ERROR is ",e)
    # This coroutine already runs inside the event loop, so the fallback
    # capture must be awaited; asyncio.run() raises RuntimeError here.
    await agent.get(placeholder_console_link)
    print("GOT PLACEHOLDER CONSOLE LINK")
  prompt2 = f"""
    You are an expert at determining whether a text appears in a UI screenshot image.

    Given the console link text, the error message text, and the UI screenshot image, determine:
    1. Does the console link text roughly match the image title?
    2. Does the error message appear somewhere in the image?

    The image title could be text at the top of the image or it could be text in the navigation pane at the left of the image.

    The error message text could appear anywhere in the image.

    Your answers should have the following format. The answers should be valid Python dictionaries with keys as shown and double quotes around values that you fill in.
    ```
    {{"image_title": title in the image,"title_match": YES or NO,"message_text": YES or NO}}
    ```
    Here are the console link text and error message text:
    {dict["console_link_text"]} and {message_text}
  """
  logger.debug("ABOUT TO SECOND PREDICT with dict is %s", dict)
  try:
    prediction = genai_client.predict_visual(prompt2, agent.image1)
    logger.debug("SCREENCAP CONSOLE LINK PREDICTION is %s", prediction)
    dict_string = remove_prefix(prediction, "{")
    dict_string = remove_suffix(dict_string, "}")
    dict_string = replace_null(dict_string)
    # The response is derived from arbitrary web page content, so it is parsed
    # as data (JSON first, then a Python literal) and never executed with eval.
    try:
      sub_dict = json.loads(dict_string)
    except ValueError:
      sub_dict = ast.literal_eval(dict_string)
    new_dict = {**dict, **sub_dict}
    new_dict["screenshot"] =  str(agent.filename)
  except Exception as e:
    if show_exceptions:
      raise
    # Copy the placeholder so the shared module-level dict is never mutated.
    new_dict = placeholder_dict.copy()
    logger.debug("error with URL %s", dict["console_link"])
    logger.debug("ERROR is %s", e)
  # .get(): the model may omit image_title altogether
  if new_dict.get("image_title") == "null":
    new_dict["image_title"] = "PLACEHOLDER"
  return new_dict




## Create human readable report

The direct output of the model is hard for a human to parse, so create a separate dataframe that parses the results for each link in a human-readable form.

In [None]:
# function to remove internal "{" and "}"
def remove_internal_chars(text,char_list):
  """Remove every string in ``char_list`` from the inside of ``text``.

  The first and last characters of ``text`` (expected to be "{" and "}") are
  discarded, each entry of ``char_list`` is removed from what remains, and the
  result is wrapped in a fresh "{" ... "}" pair.

  Args:
    text: string of the form "{...}".
    char_list: iterable of substrings to delete from the interior.

  Returns:
    The cleaned string, always starting with "{" and ending with "}".
  """
  # %s placeholders: logging formats its arguments printf-style, so without a
  # placeholder the record fails to format and the value is never logged.
  logger.debug("REMOVE INTERNAL CHARS text is %s", text)
  # strip the initial "{" and the final "}"
  intermed_text = text[1:-1]
  # remove internal
  for char in char_list:
    intermed_text = intermed_text.replace(char,'')
  # add back
  intermed_text = "{" + intermed_text + "}"
  logger.debug("REMOVE INTERNAL CHARS intermed_text2 is %s", intermed_text)
  return intermed_text

In [None]:
# parse results into human-readable form
def create_human_readable_df(df):
  """Return a new dataframe with one row per link result.

  The ``result`` column of ``df`` (a list of dicts per topic) is exploded into
  one row per dict, and every key of each dict is written into a column of the
  same name on that row.
  """
  # one row per entry of each topic's result list
  human_df = df.explode(['result'],ignore_index=True)
  # snapshot the exploded results before columns are added to the frame
  for row_label, link_result in zip(human_df.index, human_df['result'].tolist()):
    for field in link_result:
      human_df.at[row_label, field] = link_result[field]
  return human_df

# Remove Extraneous Content from Output

Remove the extraneous columns and rows from the dataframe to prepare for output in a human-readable form.

In [None]:

def remove_extraneous_df_content(human_df):
  """Delete the raw ``result`` column from ``human_df`` in place and return the same frame."""
  # remove extraneous columns
  del human_df['result']
  return human_df


# Save Dataframes to persistent storage

Print a preview of the raw dataframe. Prepare the human-readable dataframe and save it to a CSV file.


In [None]:
# output the human readable dataframe to a CSV in the local file system
def output_human_readable_df(df,timestamp):
  """Build the human-readable dataframe, save it as CSV, and return it.

  The file is written to ``sandbox_path`` as
  ``human_readable_output_<timestamp>.csv``.
  """
  human_df = remove_extraneous_df_content(create_human_readable_df(df))
  # save to the selected output location
  csv_path = os.path.join(sandbox_path, f"human_readable_output_{timestamp}.csv")
  human_df.to_csv(csv_path, sep=',', index=False)
  return human_df


In [None]:
def output_raw_df(df,timestamp):
  """Print the first rows of the raw results dataframe.

  ``timestamp`` is accepted but not used.
  """
  preview = df.head()
  print(preview)


In [None]:
# complete partial links
# for links that aren't already fully-qualified, add prefix to make them fully qualified
def cleanup_link(source_url,link,site_root="https://cloud.google.com"):
  """Turn an href taken from a doc topic into a fully-qualified link.

  Args:
    source_url: URL of the topic that contains the link; used as the base for
      fragment links and returned exactly as given (no scheme is added).
    link: href value; may be None or empty for anchors without a target.
    site_root: prefix prepended to site-relative links (those starting with "/").

  Returns:
    * ``link`` unchanged when it already starts with "https://";
    * ``source_url + link`` for fragment links (starting with "#");
    * ``site_root + link`` for site-relative links (starting with "/");
    * "LINK ERROR" for anything else, including None and non-string values.
  """
  # Anchors without an href yield None; report them like any other unusable
  # link instead of failing on None.startswith.
  if not isinstance(link, str):
    return "LINK ERROR"
  if link.startswith("https://"):
    return link
  if link.startswith("#"):
    return source_url+link
  if link.startswith("/"):
    return site_root+link
  return "LINK ERROR"

## Main model calls

Main iteration through the list of input URLs:

* For each URL in the input dataset, call `get_link_details` to get the list of links and associated link text for all the links in the topic
* For each link and associated link text, call `compare_link_with_screencap` to compare the link details with the destination using Gemini and Playwright

In [None]:
%%time
# iterate through df with one doc topic URL at a time
async def main():
  """Validate the links of every topic listed in ``df``.

  For each row, the topic's links are collected with ``get_link_details``,
  made fully qualified with ``cleanup_link``, and every link that starts with
  ``link_prefix`` is analyzed with ``compare_link_with_screencap``. The list of
  per-link results is stored in the row's ``result`` column;
  ``placeholder_dict_list`` is stored when no link was analyzed. A preview of
  ``df`` is printed at the end.
  """
  print("IN MAIN")
  for index, row in df.iterrows():
      topic_url = row["URL"]
      print("MAIN NEW ROW - URL is : ",topic_url )
      # %s placeholders throughout: logging formats its arguments printf-style,
      # so a message without a placeholder fails to format and is not logged.
      logger.debug("MAIN NEW ROW - URL is : %s", topic_url)
      # get the details for the current doc topic
      link_records = get_link_details(topic_url)
      logger.debug("MAIN dict_list is %s", link_records)
      link_results = []
      # for every console link in the doc topic, compare the doc topic console link details with the target in the console
      for link_record in link_records:
        href = link_record["console_link"]
        logger.debug("MAIN comparing image for console link: %s", href)
        logger.debug("MAIN for console link text: %s", link_record["console_link_text"])
        # anchors without an href carry None and cannot be analyzed
        if not isinstance(href, str):
          logger.debug("MAIN SKIPPING CONSOLE LINK")
          continue
        # check that the link is valid for analysis
        link_record["console_link"] = cleanup_link(topic_url, href)
        if link_record["console_link"].startswith(link_prefix):
          logger.debug("analyzing link: %s", link_record["console_link"])
          link_results.append(await compare_link_with_screencap(link_record, agent))
        else:
          logger.debug("MAIN SKIPPING CONSOLE LINK")
      # if list of dicts is not empty, save it in the df. If it is empty, save placeholder
      if link_results:
        logger.debug("MAIN WRITING DF ROW %s", link_results)
        df.at[index,'result'] = link_results
      else:
        logger.debug("WRITING DEFAULT DF ROW")
        df.at[index,'result'] = placeholder_dict_list
  logger.debug("completed analysis")
  print("completed analysis")
  output_raw_df(df, time.time())


In [None]:

# Run the analysis over every topic URL in df.
await main()

In [None]:
# Preview the dataframe now that main() has filled in the result column.
df.head()

In [None]:
# create human-readable form of output: one row per analyzed link, also written
# to a CSV file in the sandbox directory
timestamp = time.time()
human_readable_df = output_human_readable_df(df,timestamp)

In [None]:
# Preview the per-link results.
human_readable_df.head()

# Display the Validation Results

To make it easier to examine the results of the validation, the following cells generate a tabular output of the validation results where you can click on the link and screencaps to examine the differences. The output dataframe is filtered to just contain the links where the analysis noted an issue:

* **title match == NO** indicating that the link text does not match prominent text in the destination
* **error message == YES** indicating that there is some kind of error message at the destination

In [None]:



# show image from path
def show_image_popup(image_path_or_url):
    """Displays an image in a new browser window or tab.

    Args:
        image_path_or_url: an http(s) URL, or the path of a local image file.
            Local files are embedded in the popup as a base64 data URL.
    """
    # Imported locally so the function works regardless of which cell last
    # bound the names `display` and `HTML`.
    import mimetypes
    from IPython.display import HTML, display

    # If it's a local file path, convert it to a data URL for embedding
    if not image_path_or_url.startswith("http"):
        with open(image_path_or_url, "rb") as f:
            image_base64 = base64.b64encode(f.read()).decode("utf-8")
        # Derive the MIME type from the file name; the screenshots written by
        # this notebook are PNG files, so that is the fallback.
        mime_type = mimetypes.guess_type(image_path_or_url)[0] or "image/png"
        image_src = f"data:{mime_type};base64,{image_base64}"
    else:
        image_src = image_path_or_url

    # Create the HTML for the popup window
    popup_html = f"""
    <html>
    <head></head>
    <body>
        <img src="{image_src}" alt="Image">
    </body>
    </html>
    """

    # Open the popup using JavaScript
    display(HTML(f"""
    <script>
    var win = window.open('', '_blank');
    win.document.write(`{popup_html}`);
    </script>
    """))




In [None]:
# filter df to just have the potential problems
# Keep only the links flagged by the analysis: the link text did not match the
# destination, or an error message was found at the destination.
needs_review = (human_readable_df['title_match'] == 'NO') | (human_readable_df['message_text'] == 'YES')
problem_df = human_readable_df.loc[needs_review].copy()

In [None]:
# Preview the links that were flagged as potential problems.
problem_df.head()

In [None]:
# Show the results of the link validation in a clickable format
from IPython.display import display, HTML, Image

# display results in interactive way
def create_row_widget(row):
  """Build the six widgets that make up one row of the results table.

  Args:
    row: dataframe row with ``URL``, ``console_link``, ``console_link_text``,
      ``title_match``, ``message_text`` and ``screenshot`` values.

  Returns:
    A tuple (topic link, destination link, link text, title match,
    error message, screenshot button). Clicking the button opens the row's
    screenshot with ``show_image_popup``.
  """
  import html  # stdlib; imported here because the import cell does not include it

  def esc(value):
    # The values come from scraped pages and model output, so they are escaped
    # before being placed inside HTML markup.
    return html.escape(str(value), quote=True)

  topic_widget = widgets.HTML(value=f'<a href="{esc("https://"+row["URL"])}" target="_blank">{esc(row["URL"])}</a>')
  url_widget = widgets.HTML(value=f'<a href="{esc(row["console_link"])}" target="_blank">{esc(row["console_link"])}</a>')
  link_text_widget = widgets.HTML(value=esc(row["console_link_text"]))
  title_match_widget = widgets.HTML(value=esc(row["title_match"]))
  message_text_widget = widgets.HTML(value=esc(row["message_text"]))
  # the button shows only the file name; the full path is used when it is clicked
  image_button = widgets.Button(description= os.path.basename(row['screenshot']))

  def on_button_click(b):
    show_image_popup(row['screenshot'])

  image_button.on_click(on_button_click)
  return topic_widget, url_widget, link_text_widget, title_match_widget, message_text_widget, image_button

def display_interactive_table(df):
  """Render ``df`` as a six-column grid of widgets, one grid row per dataframe row.

  The first grid row holds the column headers; the remaining rows are produced
  by ``create_row_widget``.
  """
  header_titles = ("topic URL", "link", "link text", "title match?", "error message?", "screenshot")
  header_cells = [widgets.Label(value=title) for title in header_titles]

  grid = widgets.GridBox(
      children=header_cells,
      layout=widgets.Layout(grid_template_columns="repeat(6, 1fr)",grid_gap="8px")
  )
  for _, row in df.iterrows():
    # create_row_widget returns the six cells of the row as a tuple
    grid.children += create_row_widget(row)
  display(grid)


# Show the flagged links as an interactive table.
display_interactive_table(problem_df)