In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Responsible AI with Vertex AI Gemini API: Safety ratings and thresholds

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/responsible-ai/gemini_safety_ratings.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/responsible-ai/gemini_safety_ratings.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/responsible-ai/gemini_safety_ratings.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>                                                                                               
</table>

## Overview

Large language models (LLMs) can translate language, summarize text, generate creative writing, generate code, power chatbots and virtual assistants, and complement search engines and recommendation systems. The incredible versatility of LLMs is also what makes it difficult to predict exactly what kinds of unintended or unforeseen outputs they might produce. 

Given these risks and complexities, the Vertex AI Gemini API is designed with [Google's AI Principles](https://ai.google/responsibility/principles/) in mind. However, it is important for developers to understand and test their models to deploy safely and responsibly. To aid developers, Vertex AI Studio has built-in content filtering, safety ratings, and the ability to define safety filter thresholds that are right for their use cases and business.

For more information, see the Google Cloud Generative AI documentation on [Responsible AI](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/responsible-ai).

### Objectives

In this tutorial, you learn how to inspect the safety ratings returned from the Vertex AI Gemini API using the Python SDK and how to set a safety threshold to filter responses from the Vertex AI Gemini API.

The steps performed include:

- Call the Vertex AI Gemini API and inspect safety ratings of the responses
- Define a threshold for filtering safety ratings according to your needs

### Costs

This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.


## Getting Started


### Install Vertex AI SDK for Python


In [None]:
! pip3 install --upgrade --user google-cloud-aiplatform

### Restart current runtime

To use the newly installed packages in this Jupyter runtime, it is recommended to restart the runtime. Run the following cell to restart the current kernel.

The restart process might take a minute or so.


In [None]:
# Restart the kernel so that the newly installed packages become importable.
import IPython

# The True argument requests a restart rather than a plain shutdown.
IPython.Application.instance().kernel.do_shutdown(True)

After the restart is complete, continue to the next step.


<div class="alert alert-block alert-warning">
<b>⚠️ Wait for the kernel to finish restarting before you continue. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the following cell to authenticate your environment. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).


In [1]:
import sys

# Outside Colab (e.g. on Vertex AI Workbench) this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    # Colab needs an explicit sign-in before it can call Google Cloud APIs
    auth.authenticate_user()

### Define Google Cloud project information (Colab only)

If you are running this notebook on Google Colab, specify the Google Cloud project information to use. In the following cell, you specify your project information, import the Vertex AI package, and initialize the package. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).


In [2]:
# Project setup is only needed when running on Colab.
if "google.colab" in sys.modules:
    import vertexai

    # Fill in your own project ID and region below
    PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
    LOCATION = "us-central1"  # @param {type:"string"}

    # Point the Vertex AI SDK at that project and region
    vertexai.init(project=PROJECT_ID, location=LOCATION)

### Import libraries


In [3]:
from vertexai.preview.generative_models import (
    GenerationConfig,
    GenerativeModel,
    HarmCategory, 
    HarmBlockThreshold,
    Image,
    Part,
)

### Load the Gemini Pro model


In [4]:
# Decoding settings chosen to reduce variability between runs
generation_config = GenerationConfig(
    max_output_tokens=1024,
    temperature=0,
    top_k=1,
    top_p=0.1,
)

# Handle to the Gemini Pro model used by the cells below
model = GenerativeModel("gemini-pro")

## Generate text and show safety ratings

Start by generating a pleasant-sounding text response using Gemini.

In [5]:
# Ask Gemini for a friendly answer and stream it back
nice_prompt = "Say three nice things about me"
responses = model.generate_content(
    [nice_prompt],
    stream=True,
    generation_config=generation_config,
)

# Print each streamed chunk's text as it arrives, without extra line breaks
for chunk in responses:
    print(chunk.text, end="")

1. You are a kind and compassionate person. You always put others before yourself and are always willing to help those in need.
2. You are intelligent and curious. You are always eager to learn new things and are always up for a challenge.
3. You are funny and charming. You have a great sense of humor and can always make people laugh.

#### Inspecting the safety ratings

Look at the `safety_ratings` of the streaming responses.

In [6]:
# Send the nice prompt again, but print every streamed chunk in full so that
# its safety ratings are visible alongside the text.
responses = model.generate_content(
    [nice_prompt],
    stream=True,
    generation_config=generation_config,
)

for chunk in responses:
    print(chunk)

candidates {
  content {
    role: "model"
    parts {
      text: "1. You are a kind and compassionate person. You always put others before yourself"
    }
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
}

candidates {
  content {
    role: "model"
    parts {
      text: " and are always willing to help those in need.\n2. You are intelligent"
    }
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: 

#### Understanding the safety rating: category and probability

You can see the safety ratings, including each `category` type and its associated `probability` label.

The `category` types include:

* Harassment: `HARM_CATEGORY_HARASSMENT`
* Hate speech: `HARM_CATEGORY_HATE_SPEECH`
* Sexually explicit statements: `HARM_CATEGORY_SEXUALLY_EXPLICIT`
* Dangerous content: `HARM_CATEGORY_DANGEROUS_CONTENT`

The `probability` labels are:

* `NEGLIGIBLE` - content has a negligible probability of being unsafe
* `LOW` - content has a low probability of being unsafe
* `MEDIUM` - content has a medium probability of being unsafe
* `HIGH` - content has a high probability of being unsafe

Try a prompt that might trigger one of these categories:

In [7]:
# A mildly impolite request, used to see whether any safety rating changes
impolite_prompt = "Write a list of 5 disrespectful things that I might say to the universe after stubbing my toe in the dark:"

impolite_responses = model.generate_content(
    contents=impolite_prompt,
    stream=True,
    generation_config=generation_config,
)

# Print every streamed chunk in full, including its safety ratings
for chunk in impolite_responses:
    print(chunk)

candidates {
  content {
    role: "model"
    parts {
      text: "1. \"Thanks a lot, universe! I really appreciate you making me stub"
    }
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
}

candidates {
  content {
    role: "model"
    parts {
      text: " my toe in the dark. That was super helpful.\"\n2. \"Seriously"
    }
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROU

#### Blocked responses

If the response is blocked, the final candidate has `finish_reason: SAFETY`, and the safety rating that triggered the block is marked with `blocked: true`.

In [8]:
# A ruder variant of the prompt, which is more likely to be blocked
rude_prompt = "Write a list of 5 very rude things that I might say to the universe after stubbing my toe in the dark:"

rude_responses = model.generate_content(
    contents=rude_prompt,
    stream=True,
    generation_config=generation_config,
)

# Print every streamed chunk in full, including its safety ratings
for chunk in rude_responses:
    print(chunk)

candidates {
  content {
    role: "model"
    parts {
      text: "1. \"Are you kidding me, universe? You couldn\'t give me"
    }
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
}

candidates {
  content {
    role: "model"
    parts {
      text: " a little heads up about that table leg?\"\n2. \"Seriously, universe"
    }
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CON

### Defining thresholds for safety ratings

You may want to adjust the default safety filter thresholds depending on your business policies or use case. The Vertex AI Gemini API provides you a way to pass in a threshold for each category.

The list below shows the possible threshold labels:

* `BLOCK_ONLY_HIGH` - block when high probability of unsafe content is detected
* `BLOCK_MEDIUM_AND_ABOVE` - block when medium or high probability of unsafe content is detected
* `BLOCK_LOW_AND_ABOVE` - block when low, medium, or high probability of unsafe content is detected
* `BLOCK_NONE` - always show, regardless of probability of unsafe content

#### Set safety thresholds
Below, the safety thresholds have been set to the most sensitive threshold: `BLOCK_LOW_AND_ABOVE`

In [9]:
# Apply the most sensitive threshold, BLOCK_LOW_AND_ABOVE, to every harm category.
# To tune categories individually, write the mapping out with one threshold per category.
safety_settings = {
    harm_category: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE
    for harm_category in (
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
}

#### Test thresholds

Here you will reuse the impolite prompt from earlier together with the most sensitive safety threshold. It should block the response even with the `LOW` probability label.

In [10]:
# Same impolite prompt as before, now sent with the custom safety thresholds
impolite_prompt = "Write a list of 5 disrespectful things that I might say to the universe after stubbing my toe in the dark:"

impolite_responses = model.generate_content(
    contents=impolite_prompt,
    stream=True,
    safety_settings=safety_settings,
    generation_config=generation_config,
)

# Print every streamed chunk in full, including its safety ratings
for response in impolite_responses:
    print(response)

candidates {
  content {
    role: "model"
    parts {
      text: "1. \"Thanks a lot, universe! I really appreciate you making me stub"
    }
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
}

candidates {
  content {
    role: "model"
    parts {
      text: " my toe in the dark. That was super helpful.\"\n2. \"Seriously"
    }
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROU