Note: Below is only for Managed Identity:

In [14]:
# Alternative client setup that authenticates with a managed identity instead of an API key.
# Uncomment this cell (and skip the API-key based setup further down) to use it.
# import openai
# from openai import AzureOpenAI
# import os 
# from azure.identity import ManagedIdentityCredential

# default_credential=ManagedIdentityCredential(client_id="XXX")
# token=default_credential.get_token("https://cognitiveservices.azure.com/.default")
# Resource_endpoint="XXXX"

# NOTE(review): a Microsoft Entra ID access token should be passed as `azure_ad_token`
# (or via `azure_ad_token_provider` so it is refreshed automatically), not as `api_key`,
# which is sent as a plain API-key header - confirm against the openai SDK documentation.
# client = AzureOpenAI(
#   azure_endpoint = Resource_endpoint, 
#   azure_ad_token=token.token,  
#   api_version="2023-05-15"
# )

Install libraries:

In [None]:
pip install python-dotenv

In [None]:
pip install openai

In [9]:
import os
import openai
from openai import AzureOpenAI
from dotenv import load_dotenv

# Set up Azure OpenAI: load AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT
# from the local credentials.env file into the process environment.
load_dotenv("credentials.env")

# NOTE(review): looks like a leftover from the pre-1.0 openai SDK; the explicit
# AzureOpenAI client below is configured on its own - confirm before removing.
openai.api_type = "azure"
    
# Shared client used by every cell below.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version="2025-01-01-preview", # NOTE: this is a *preview* API version, not a GA one; supported versions are listed at https://learn.microsoft.com/en-us/azure/ai-services/openai/api-version-deprecation
    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    )

In [1]:
#print(os.getenv("AZURE_OPENAI_ENDPOINT"))

# Temperature

Defaults to 1, Optional

What sampling temperature to use, between 0 and 2. Higher values mean the model will take more risks. Try 0.9 for more creative applications, and 0 for ones with a well-defined answer.

We generally recommend altering this or top_p but not both.

In [11]:
def call_openai(
    num_times,
    start_phrase,
    temperature,
    *,
    max_tokens=10,
    deployment_name="gpt-4o",
):
    """Send the same prompt several times and print every reply.

    Repeating an identical request makes the effect of ``temperature``
    visible: the replies can be compared side by side.

    NOTE: a later cell of this notebook redefines ``call_openai`` with a
    ``top_p`` parameter; re-run this cell before calling it with
    ``temperature`` again.

    Args:
        num_times: Number of requests to send.
        start_phrase: Text sent as the user message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Upper bound on generated tokens per reply (default 10).
        deployment_name: Name of the Azure OpenAI deployment to call.
    """
    # The request payload is identical for every call, so build it once
    # instead of on each loop iteration.
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": start_phrase},
    ]

    for _ in range(num_times):
        response = client.chat.completions.create(
            model=deployment_name,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )

        # `message.content` is optional in the SDK (it can be None, e.g. when
        # the reply is filtered), so fall back to an empty string before
        # stripping instead of raising AttributeError.
        content = response.choices[0].message.content or ""
        print(content.strip())
        print("*****************************")


In [None]:
# temperature=0: the five replies should be (nearly) identical.
call_openai(num_times=5, start_phrase='Azure machine learning is ', temperature=0)

In [None]:
# temperature=0.5: some variation between the five replies is expected.
call_openai(num_times=5, start_phrase='Azure machine learning is ', temperature=0.5)

# Top_p

Defaults to 1, Optional

The top_p parameter, which stands for “top probability”, is an alternative to sampling with temperature called nucleus sampling: the model considers only the tokens that make up the top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.

top_p refers to the probability mass that is used when choosing the next token in the generated text. Essentially it **sets a cumulative-probability threshold: only the most likely tokens whose combined probability reaches top_p are considered.**

We generally recommend altering this or temperature but not both.

In [14]:
def call_openai(
    num_times,
    start_phrase,
    top_p,
    *,
    max_tokens=30,
    deployment_name="gpt-4o",
):
    """Send the same prompt several times and print every reply.

    Repeating an identical request makes the effect of ``top_p`` (nucleus
    sampling) visible: the replies can be compared side by side.

    NOTE: this replaces the earlier ``call_openai`` definition in this
    notebook that took ``temperature`` as its third parameter.

    Args:
        num_times: Number of requests to send.
        start_phrase: Text sent as the user message.
        top_p: Nucleus-sampling probability mass forwarded to the API.
        max_tokens: Upper bound on generated tokens per reply (default 30).
        deployment_name: Name of the Azure OpenAI deployment to call.
    """
    # The request payload is identical for every call, so build it once
    # instead of on each loop iteration.
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": start_phrase},
    ]

    for _ in range(num_times):
        response = client.chat.completions.create(
            model=deployment_name,
            messages=messages,
            top_p=top_p,
            max_tokens=max_tokens,
        )

        # `message.content` is optional in the SDK (it can be None, e.g. when
        # the reply is filtered), so fall back to an empty string before
        # stripping instead of raising AttributeError.
        content = response.choices[0].message.content or ""
        print(content.strip())
        print("*****************************")

In [None]:
# top_p=0.1: sample only from the top 10% of the probability mass.
call_openai(num_times=10, start_phrase='Azure machine learning is ', top_p=0.1)

In [None]:
# top_p=1: sample from the full distribution.
call_openai(num_times=10, start_phrase='Azure machine learning is ', top_p=1)

# Max_Tokens 

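Default value=16 for the legacy Completions API; the Chat Completions API (used below) has no fixed default, so the reply is limited only by the model's context/output limit. Optional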

The maximum number of tokens to generate in the completion. **The token count of your prompt plus max_tokens can't exceed the model's context length.**

In [None]:
# Name given to the model deployment in Azure OpenAI; later cells reuse it.
deployment_name = "gpt-4o"
start_phrase = "Azure machine learning is "

# Allow up to 105 generated tokens for this reply.
response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": start_phrase},
    ],
    model=deployment_name,
    max_tokens=105,
    temperature=1,
)

# Show the complete response object as indented JSON.
print(response.model_dump_json(indent=2))


In [None]:
start_phrase = "Howden Group Holdings insurance "

# Same request shape with a different prompt and a 90-token cap;
# `deployment_name` comes from the previous cell.
response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": start_phrase},
    ],
    model=deployment_name,
    max_tokens=90,
    temperature=1,
)

# Show the complete response object as indented JSON.
print(response.model_dump_json(indent=2))


# n

Defaults to 1, optional

How many completions to generate for each prompt. To generate multiple completions, we specify the n request parameter, which simply stands for **“number of completions”**.

Note: Because this parameter generates many completions, it can quickly consume your token quota. Use carefully and ensure that you have reasonable settings for max_tokens and stop.

In [None]:
# Azure OpenAI deployment to call and the prompt to send.
deployment_name = "gpt-4o"
start_phrase = "Azure machine learning is "

# Ask for three alternative completions of the same prompt in one request.
response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": start_phrase},
    ],
    model=deployment_name,
    n=3,
    temperature=0.7,
)

# Each requested completion comes back as its own entry in `choices`.
for choice in response.choices:
    print(choice.message.content.strip())
    print("**************************")


# logprobs

Defaults to null, optional

Log probabilities of output tokens indicate **the likelihood of each token occurring in the sequence given the context.** To simplify, a logprob is log(p), where p = probability of a token occurring at a specific position based on the previous tokens in the context.

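For example, in the legacy Completions API, if logprobs is 5, the API will return a list of the 5 most likely tokens. The API will always return the logprob of the sampled token, so there may be up to logprobs+1 elements in the response. In the Chat Completions API used below, logprobs is a boolean, and the optional top_logprobs parameter (an integer) sets how many of the most likely tokens are returned at each position.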

**tokens** — is an array of tokens generated by the language model. Each token is a word or part of a word.

**token_logprobs** — represents an array of log probabilities for each token in the tokens array. Log probability indicates the likelihood of the language model generating that token for the given prompt. The logprob values are negative, where smaller (more negative) numbers indicate a less likely outcome.

**top_logprobs** — represents an array of log probability objects, representing tokens most likely to be used for the completion. For example, if we request the 5 most likely tokens, then top_logprobs would contain, for each position, the log probabilities of the 5 most likely candidate tokens.

In [None]:
# Azure OpenAI deployment to call and the prompt to send.
deployment_name = "gpt-4o"
start_phrase = "Azure machine learning is "

# Request a single completion and ask for per-token log probabilities;
# in the chat API `logprobs` is a boolean switch.
response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": start_phrase},
    ],
    model=deployment_name,
    logprobs=True,
    n=1,
    temperature=0,
)

print(response.choices[0].message.content.strip())


In [16]:
#response.choices[0].text

In [None]:
print(response.model_dump_json(indent=2))

In [None]:
pip install numpy

In [17]:
import numpy as np

In [None]:
# Inspect the per-token log probabilities of the last chat completion.
# A chat completion exposes them as `choices[i].logprobs.content`, a list with
# one entry per generated token (`.token`, `.logprob`). The legacy Completions
# fields (`logprobs.tokens`, `logprobs.token_logprobs`, `choices[i].text`) do
# not exist on this response type and would raise AttributeError.
first_choice = response.choices[0]
logprob_info = first_choice.logprobs

# `logprobs` is None when the request did not set logprobs=True, and its
# `content` may also be None; treat both cases as "no token data".
token_entries = (logprob_info.content or []) if logprob_info is not None else []
tokens = [entry.token for entry in token_entries]
token_logprobs = [entry.logprob for entry in token_entries]

print("Word values :", tokens)
print("Token log probabilities :", token_logprobs)
# exp(logprob) converts a log probability back to a linear probability; shown in percent.
print("Top log linear probabilities :", np.round(np.exp(token_logprobs) * 100, 2))

print("Response:", first_choice.message.content)

# Presence_penalty

Defaults to 0, Optional -> 0 means there is really no penalty or reward for the same token appearing multiple times. 

Number between -2.0 and 2.0. Positive values penalize new tokens based on **whether they appear in the text so far**, increasing the model's likelihood to talk about new topics.

Smaller values (minimum -2) decrease the penalty and increase the chances of a token appearing, while higher values (maximum 2) increase the penalty and decrease the chances of a token appearing.

In [None]:
# Azure OpenAI deployment to call and the prompt to send.
deployment_name = "gpt-4o"
start_phrase = "Azure machine learning is "

# presence_penalty=2 is the strongest penalty for tokens that already
# appeared; the reply is capped at 50 tokens.
response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": start_phrase},
    ],
    model=deployment_name,
    max_tokens=50,
    presence_penalty=2,
)

# Show the generated text.
print("Response:", response.choices[0].message.content.strip())


In [None]:
# Azure OpenAI deployment to call and the prompt to send.
deployment_name = "gpt-4o"
start_phrase = "Azure machine learning is "

# presence_penalty=-2 is the opposite extreme: tokens that already appeared
# become more likely; the reply is capped at 50 tokens.
response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": start_phrase},
    ],
    model=deployment_name,
    max_tokens=50,
    presence_penalty=-2,
)

# Show the generated text.
print("Response:", response.choices[0].message.content.strip())


In [None]:
print(response.model_dump_json(indent=2))


# Frequency_penalty

Defaults to 0

Number between -2.0 and 2.0. Positive values **penalize new tokens based on their existing frequency in the text so far**, decreasing the model's likelihood to repeat the same line verbatim.


# Best_of

Note: Works with the Completions API, not the Chat Completions API.

Defaults to 1, Optional

This parameter tells the language model to generate multiple completions and return the best one, which is the one with the highest log probability per token.


Note: Because this parameter generates many completions, it can quickly consume your token quota. Use carefully and ensure that you have reasonable settings for max_tokens and stop.



In [None]:
# deployment_name = "gpt-4o"  # Your Azure OpenAI deployment name  
  
# # Define the start phrase  
# start_phrase = "Azure OpenAI is "  
  
# # Correct method call is `client.chat.completions.create`  
# response = client.chat.completions.create(  
#     model=deployment_name,   
#     messages=[  
#         {"role": "system", "content": "You are a helpful assistant."},  
#         {"role": "user", "content": start_phrase}  
#     ],  
#     max_tokens=50,  
#     temperature=0.7
#     #best_of=3
      
# )  
  
# # Output the best completion  
# print("Response:", response.choices[0].message.content.strip())  

# logit_bias

Defaults to null

The logit_bias request parameter is used to modify the likelihood of specified tokens appearing in the completion. We can use this parameter to provide hints to the language model about **which tokens we want or don’t want to appear in the completion**. It basically allows us to make the model more biased towards certain keywords or topics.

In [None]:
# logit_bias maps token IDs (as strings) to a bias value:
#   -100 should completely prevent the token from appearing,
#    100 will show only that token.
# Azure OpenAI deployment to call and the prompt to send.
deployment_name = "gpt-4o"
start_phrase = "Azure OpenAI is "

# Nudge token "30" upwards (+10) and give token "5936" a bias of 0.
response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": start_phrase},
    ],
    model=deployment_name,
    logit_bias={"30": 10, "5936": 0},
    max_tokens=50,
)

# Show the generated text.
print("Response:", response.choices[0].message.content.strip())

## Echo and Stop

Note: The echo parameter only works with Completion API. 

By setting the echo parameter to true, you’re asking the language model to **return the prompt embedded within the completion.** This is useful for debugging.

In [None]:
# Azure OpenAI deployment to call and the prompt to send.
deployment_name = "gpt-4o"
start_phrase = "Azure OpenAI is "

# Plain chat request capped at 50 tokens. `echo` is deliberately left out:
# it is only available on the Completions API, so this cell just shows the
# reply without the prompt echoed back.
response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": start_phrase},
    ],
    model=deployment_name,
    max_tokens=50,
    # echo=False
)

# Show the generated text.
print("Response:", response.choices[0].message.content.strip())

In [None]:
# Azure OpenAI deployment to call and the prompt to send.
deployment_name = "gpt-4o"
start_phrase = "Azure OpenAI is "

# A single stop sequence, passed as a plain string: generation halts when
# the model reaches "Microsoft".
response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": start_phrase},
    ],
    model=deployment_name,
    stop="Microsoft",
    temperature=0,
    max_tokens=50,
)

# Show the generated text.
print("Response:", response.choices[0].message.content.strip())

The **stop** parameter allows you to specify up to 4 sequences of text on which the language model will halt and return the result. This is useful for specifying early termination triggers for the language model.

In [None]:
# Azure OpenAI deployment to call and the prompt to send.
deployment_name = "gpt-4o"
start_phrase = "Azure OpenAI is "

# Several stop sequences, passed as a list: generation halts at whichever
# of "Microsoft" or "since" is reached first.
response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": start_phrase},
    ],
    model=deployment_name,
    stop=["Microsoft", "since"],
    temperature=0,
    max_tokens=50,
)

# Show the generated text.
print("Response:", response.choices[0].message.content.strip())

https://learn.microsoft.com/en-us/azure/ai-services/openai/reference