# Azure OpenAI Configuration

Add your [Azure OpenAI Service key](https://learn.microsoft.com/azure/cognitive-services/openai/quickstart?pivots=programming-language-studio) settings to a `.env` file in the same folder:

```
AZURE_OPENAI_API_KEY="..."
AZURE_OPENAI_ENDPOINT="https://..."
AZURE_OPENAI_DEPLOYMENT_NAME="..."
```

and add Azure OpenAI Text Completion to the kernel

## Semantic Kernel version: semantic-kernel    0.3.1.dev0

In [1]:
import os
from dotenv import load_dotenv
# Load the Azure OpenAI settings from a local env file into os.environ.
# Variables read explicitly in this cell:
#   GPT4-1106-128k          -> chat deployment name, assigned to MODEL
#   EMBEDDING_ADA_002       -> embedding deployment name
#   EMBEDDING_ADA_003_SMALL -> embedding deployment name
#   EMBEDDING_ADA_003_LARGE -> embedding deployment name
# NOTE(review): AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, OPENAI_API_VERSION
# and AZURE_OPENAI_API_TYPE are presumably consumed implicitly by the langchain
# clients below -- confirm. BING_SUBSCRIPTION_KEY and BING_SEARCH_URL were also
# listed as expected in the env file but are not read in this cell.

load_dotenv("./../credentials_my.env")
MODEL = os.environ["GPT4-1106-128k"] 

from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import AzureOpenAIEmbeddings

# Chat client bound to the MODEL deployment.
llm = AzureChatOpenAI(deployment_name=MODEL, temperature=0, max_tokens=1000)
# One embeddings client per deployment, so that later cells can compare them.
embeddings_model_2_model       = AzureOpenAIEmbeddings(deployment=os.environ["EMBEDDING_ADA_002"])
embeddings_model_3_small_model = AzureOpenAIEmbeddings(deployment=os.environ["EMBEDDING_ADA_003_SMALL"])
embeddings_model_3_large_model = AzureOpenAIEmbeddings(deployment=os.environ["EMBEDDING_ADA_003_LARGE"])

  warn_deprecated(
  warn_deprecated(


In [2]:
for m in (embeddings_model_2_model, embeddings_model_3_small_model, embeddings_model_3_large_model):
    embeddings = m.embed_query("anatine amigos")
    print(f"Embeddings in ({m.deployment}): {len(embeddings)}. First elements: {[round(embeddings[i],3) for i in [0,1,2,-1]]}")    

Embeddings in (text-embedding-ada-002): 1536. First elements: [-0.027, -0.001, 0.02, -0.023]


NotFoundError: Error code: 404 - {'error': {'code': 'DeploymentNotFound', 'message': 'The API deployment for this resource does not exist. If you created the deployment within the last 5 minutes, please wait a moment and try again.'}}

# Choose your model

In [3]:
model = embeddings_model_3_large_model

# Calculate Word Embeddings
To use word embeddings for semantic search, you first compute the embeddings for a corpus of text using an embedding model. What does this mean? We are going to create a numerical representation (a vector) of each of these words. To perform this computation, we'll use the `embed_query` method of the Azure OpenAI embeddings model selected above.

Since we have our words in a pandas dataframe, we can use `apply` to call `embed_query` on each row in the dataframe. We then store the dataframe with the calculated word embeddings in a new pickle file called `words_enriched.pkl`, so that the embeddings can be reloaded from disk instead of being recomputed when the call to Azure OpenAI fails.

# Learn how to generate embeddings with Azure OpenAI
https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/embeddings?tabs=console

In [4]:
import numpy as np
embeddings = model.embed_query("coniglio")
print(f"embeddings shape: {np.array(embeddings).shape}. Now showing the first 5 elements:\n{embeddings[:5]}")

embeddings shape: (3072,). Now showing the first 5 elements:
[-0.03792313490501229, -0.0036819255951237536, 0.0013807221563790686, 0.0026059928378761683, 0.01834674926626328]


# Cosine similarity
https://en.wikipedia.org/wiki/Cosine_similarity
![image.png](attachment:image.png)

In [5]:
v1 = np.array([1,2,3])
v2 = np.array([4,5,6])
numerator = np.dot(v1,v2) # 4+10+18
numerator

32

In [6]:
denominator_1 = np.linalg.norm(v1) # sqrt (1+4+9) = 3.74
denominator_2 = np.linalg.norm(v2) # sqrt (16+25+36) = 8.77
denominator   = denominator_1 * denominator_2
denominator

32.83291031876401

In [7]:
print(numerator / denominator)

0.9746318461970762


In [8]:
def cosine_similarity(v1,v2):
    """Return the cosine similarity of the vectors ``v1`` and ``v2``.

    The value is the dot product of the two vectors divided by the
    product of their Euclidean norms.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / norm_product

cosine_similarity(np.array([1,2,3]),np.array([4,5,6]))

0.9746318461970762

In [9]:
cosine_similarity(np.array([1,2,3,-1,2,3,-1,2,3,-1,2,3]),np.array([-4,5,6,4,5,-6,4,5,-6,4,5,6]))

0.1827434711619518

## Create an embedding for a single word

In [10]:
#openai.Embedding.create(input="rabbit", engine=azure_openai_deployment_emb)['data'][0]['embedding']
model.embed_query("rabbit")

[-0.05453244177818716,
 -0.029169240718396824,
 0.006434391197981275,
 0.010232631961414246,
 0.036999698413693785,
 0.006067825895358744,
 -0.0341295708793919,
 -0.003786541722180779,
 -0.005982033857498566,
 -0.009320118385275317,
 -0.033349644109138064,
 -0.010427613653977704,
 -0.006329100990864751,
 -0.006294004565600099,
 0.00178408120638711,
 0.022820640161291875,
 0.016066480291358126,
 0.004043917137269389,
 -0.004465077500074201,
 -0.03313126386840894,
 -0.012244840979759485,
 -0.006500685066585107,
 0.03359921993056124,
 -0.007506789575757726,
 -0.012018662775179413,
 0.06464028303492213,
 -0.013820292310614171,
 0.026361506208128164,
 -0.04891697275764985,
 -0.028202131150753047,
 0.04267756232090946,
 0.032413731984833466,
 0.005712959401157764,
 0.005642766084967176,
 -0.008992548955504194,
 -0.007744667054420631,
 -0.003710499001779416,
 0.019997307860676196,
 -0.007623778591295801,
 0.007838258453115605,
 -0.01893660782566001,
 -0.0046912561703155555,
 -0.06123980455178

## Check distance between two words

In [11]:
v1 = model.embed_query("coniglio")
v2 = model.embed_query("elefante")

cosine_similarity(v1,v2)

0.4093475665093529

# Read Data File Containing Words
Now that we have configured OpenAI, let's start with a simple CSV file with familiar words

In [12]:
import pandas as pd
file_name = 'words'
df = pd.read_csv(f'{file_name}.csv')
print(df)

            text
0            red
1       potatoes
2           soda
3         cheese
4          water
5           blue
6         crispy
7      hamburger
8         coffee
9          green
10          milk
11      la croix
12        yellow
13     chocolate
14  french fries
15         latte
16          cake
17         brown
18  cheeseburger
19      espresso
20    cheesecake
21         black
22         mocha
23         fizzy
24        carbon
25        banana


In [13]:
# Compute one embedding per word with the selected model and cache the
# enriched dataframe on disk. If that step fails, fall back to the pickle
# written by a previous successful run.
try:
    df["text_embeddings"] = df["text"].apply(model.embed_query)
    # Original note: after a pickle round-trip the embedding values are floats.
    df.to_pickle(f"{file_name}_enriched.pkl")
except Exception as exc:
    # Catch Exception instead of using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the cell, and report why the
    # fallback was taken rather than hiding the failure.
    print(f"switching to pickle file... ({type(exc).__name__}: {exc})")
    # NOTE: only unpickle trusted files; this one is expected to be the cache
    # produced by this same cell.
    df = pd.read_pickle(f"{file_name}_enriched.pkl")

df

Unnamed: 0,text,text_embeddings
0,red,"[0.010682868276239597, -0.025816932986952614, ..."
1,potatoes,"[0.006450268986695935, 0.012142651867029376, -..."
2,soda,"[0.005380749317546352, -0.0293968547822167, -0..."
3,cheese,"[-0.03643320895200616, -0.006331850812509498, ..."
4,water,"[-0.02167165608120844, -0.047557695738994675, ..."
5,blue,"[0.003121743291673462, -0.04821173610925128, -..."
6,crispy,"[-0.01083600880696952, 0.0002024675025911774, ..."
7,hamburger,"[-0.01751951629333089, -0.00758232616990886, -..."
8,coffee,"[0.0063572316384040515, -0.034859091017991825,..."
9,green,"[0.012146398049303644, 0.01554770626453294, 0...."


In [14]:
search_term = input ('Insert a search term:\n') # hot dog

#search_term_vector = openai.Embedding.create(input=search_term, engine=azure_openai_deployment_emb)['data'][0]['embedding']
search_term_vector = model.embed_query(search_term)
df["similarity"] = df["text_embeddings"].apply(lambda x: cosine_similarity(x,search_term_vector))
df

Insert a search term:
hot dog


Unnamed: 0,text,text_embeddings,similarity
0,red,"[0.010682868276239597, -0.025816932986952614, ...",0.330123
1,potatoes,"[0.006450268986695935, 0.012142651867029376, -...",0.374107
2,soda,"[0.005380749317546352, -0.0293968547822167, -0...",0.371776
3,cheese,"[-0.03643320895200616, -0.006331850812509498, ...",0.437025
4,water,"[-0.02167165608120844, -0.047557695738994675, ...",0.276844
5,blue,"[0.003121743291673462, -0.04821173610925128, -...",0.253147
6,crispy,"[-0.01083600880696952, 0.0002024675025911774, ...",0.336796
7,hamburger,"[-0.01751951629333089, -0.00758232616990886, -...",0.570036
8,coffee,"[0.0063572316384040515, -0.034859091017991825,...",0.287668
9,green,"[0.012146398049303644, 0.01554770626453294, 0....",0.234057


In [15]:
df.sort_values("similarity", ascending=False)

Unnamed: 0,text,text_embeddings,similarity
7,hamburger,"[-0.01751951629333089, -0.00758232616990886, -...",0.570036
18,cheeseburger,"[-0.00887391267069774, -0.005885987651941204, ...",0.505226
3,cheese,"[-0.03643320895200616, -0.006331850812509498, ...",0.437025
14,french fries,"[-0.005983989235463095, -0.004254760697222744,...",0.418449
1,potatoes,"[0.006450268986695935, 0.012142651867029376, -...",0.374107
2,soda,"[0.005380749317546352, -0.0293968547822167, -0...",0.371776
16,cake,"[-0.048063532228851784, 0.0014677158541126047,...",0.370104
20,cheesecake,"[-0.03696693114159991, -0.026733472991572096, ...",0.355223
13,chocolate,"[-0.005418779911789121, -0.02792785563612241, ...",0.339274
6,crispy,"[-0.01083600880696952, 0.0002024675025911774, ...",0.336796


In [16]:
search_term = "hot dog"
#search_term_vector = openai.Embedding.create(input=search_term, engine=azure_openai_deployment_emb)['data'][0]['embedding']
search_term_vector = model.embed_query(search_term)

df["similarity"] = df["text_embeddings"].apply(lambda x: cosine_similarity(x,search_term_vector))
df = df.sort_values("similarity", ascending=False)
df

Unnamed: 0,text,text_embeddings,similarity
7,hamburger,"[-0.01751951629333089, -0.00758232616990886, -...",0.570036
18,cheeseburger,"[-0.00887391267069774, -0.005885987651941204, ...",0.505226
3,cheese,"[-0.03643320895200616, -0.006331850812509498, ...",0.437025
14,french fries,"[-0.005983989235463095, -0.004254760697222744,...",0.418449
1,potatoes,"[0.006450268986695935, 0.012142651867029376, -...",0.374107
2,soda,"[0.005380749317546352, -0.0293968547822167, -0...",0.371776
16,cake,"[-0.048063532228851784, 0.0014677158541126047,...",0.370104
20,cheesecake,"[-0.03696693114159991, -0.026733472991572096, ...",0.355223
13,chocolate,"[-0.005418779911789121, -0.02792785563612241, ...",0.339274
6,crispy,"[-0.01083600880696952, 0.0002024675025911774, ...",0.336796


# Campaign Speeches

In [17]:
file_name = 'discorso_presidente_del_consiglio'

df = pd.read_csv(f'{file_name}.csv')

df

Unnamed: 0,text
0,"Signor Presidente, onorevoli colleghi, sono in..."
1,Sono i momenti fondanti della nostra democrazi...
2,La celerità di questi giorni era per noi non s...
3,Tra i tanti pesi che sento gravare sulle mie s...
4,Ma il mio ringraziamento più sentito non può n...
5,"Intendiamo farlo, assumendoci pienamente i dir..."
6,Ci siamo presentati in campagna elettorale con...
7,L'Italia è a pieno titolo parte dell'Occidente...
8,Permettetemi innanzitutto di ringraziare i ver...
9,Perché è quello il luogo in cui l'Italia farà ...


In [18]:
# Compute one embedding per paragraph with the selected model and cache the
# enriched dataframe on disk. If that step fails, fall back to the pickle
# written by a previous successful run.
try:
    df["text_embeddings"] = df["text"].apply(model.embed_query)
    # Original note: the pickle keeps the embedding values as floats, whereas
    # the CSV export stores them as strings.
    df.to_pickle(f"{file_name}_enriched.pkl")
    df.to_csv(f"{file_name}_enriched.csv")
except Exception as exc:
    # Catch Exception instead of using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the cell, and report why the
    # fallback was taken rather than hiding the failure.
    print("switching to pickle file...")
    print(f"reason: {type(exc).__name__}: {exc}")
    # NOTE: only unpickle trusted files; this one is expected to be the cache
    # produced by this same cell.
    df = pd.read_pickle(f"{file_name}_enriched.pkl")
    
df

Unnamed: 0,text,text_embeddings
0,"Signor Presidente, onorevoli colleghi, sono in...","[0.008210286625426541, -0.0020790654557089772,..."
1,Sono i momenti fondanti della nostra democrazi...,"[0.012139380355880216, 0.0067720982882943145, ..."
2,La celerità di questi giorni era per noi non s...,"[0.018013088005980532, -0.014311088227936032, ..."
3,Tra i tanti pesi che sento gravare sulle mie s...,"[-0.017884448346279055, -0.0019103567070720181..."
4,Ma il mio ringraziamento più sentito non può n...,"[0.013222380812480021, -0.008586756129859093, ..."
5,"Intendiamo farlo, assumendoci pienamente i dir...","[0.004221303318800842, -0.025013405416804493, ..."
6,Ci siamo presentati in campagna elettorale con...,"[0.013726113250191813, -0.046657200515507175, ..."
7,L'Italia è a pieno titolo parte dell'Occidente...,"[-0.011417268520442324, 0.011669120490735986, ..."
8,Permettetemi innanzitutto di ringraziare i ver...,"[-0.012653328113663485, -0.026921461491752365,..."
9,Perché è quello il luogo in cui l'Italia farà ...,"[-0.02817936778068017, -0.017644284153098613, ..."


In [19]:
# accoglienza profughi extracomunitari --> flussi migratori, traffico di essere umani, immigrazione
# a quali gruppi appartiene l'Italia?
search_term = input ('Insert a search term:\n')

#search_term_vector = openai.Embedding.create(input=search_term, engine=azure_openai_deployment_emb)['data'][0]['embedding']
search_term_vector = model.embed_query(search_term)

df["similarity"] = df["text_embeddings"].apply(lambda x: cosine_similarity(x,search_term_vector))
df.sort_values("similarity", ascending=False).head(5)

Insert a search term:
accoglienza profughi extracomunitari


Unnamed: 0,text,text_embeddings,similarity
46,"Sicurezza e legalità, certo, riguardano anche ...","[0.003658902631433679, -0.013870511377368274, ...",0.448424
47,"E allora mancherà un’ultima cosa da fare, fors...","[-0.019456457557967523, -0.013937677005133407,...",0.365355
33,C'è un tema di povertà dilagante che non possi...,"[-0.016606982190065487, 0.013696602976115234, ...",0.349362
45,Vogliamo prendere l'impegno di riavvicinare ai...,"[0.020175408692413178, -0.023206828300627418, ...",0.3031
35,"Diversi studi dimostrano come, oggi, chi vive ...","[0.015329844517423998, -0.003895615278114097, ...",0.300728


## Generalize top paragraphs identification

In [20]:
def top_paragraphs_identification(search_term, top_terms=5):
    """Return the rows of ``df`` most similar to ``search_term``.

    The search term is embedded with the module-level ``model``; every entry
    of ``df["text_embeddings"]`` is then scored against it with
    ``cosine_similarity``. The scores are written to the ``similarity``
    column of the module-level ``df`` (side effect), and the ``top_terms``
    highest-scoring rows are returned in descending order of similarity.
    """
    query_vector = model.embed_query(search_term)

    def _score(paragraph_vector):
        return cosine_similarity(paragraph_vector, query_vector)

    df["similarity"] = df["text_embeddings"].apply(_score)
    ranked = df.sort_values("similarity", ascending=False)
    return ranked.head(top_terms)


# possible questions:
# "A quali gruppi appartiene l'Italia?"
# "Che cosa hanno fatto le Forze Armate?"
# "Quali sono le donne italiane che hanno dimostrato valore?"
# "Quali sono le grandi sfide dell'Unione Europea?"
# "Quale strada vuole perseguire il governo?"

question = "Quali sono le donne italiane che hanno dimostrato valore?"
best_paragraphs_df = top_paragraphs_identification(question)
best_paragraph_nr = best_paragraphs_df.index[0]
best_paragraph_text = best_paragraphs_df["text"][best_paragraph_nr]
print(f"The best answer should be in paragraph #{best_paragraph_nr}:\n{best_paragraph_text}")

The best answer should be in paragraph #3:
Tra i tanti pesi che sento gravare sulle mie spalle oggi, non può non esserci anche quello di essere la prima donna a capo del governo in questa Nazione. Quando mi soffermo sulla portata di questo fatto, mi ritrovo inevitabilmente a pensare alla responsabilità che ho di fronte alle tante donne che in questo momento affrontano difficoltà grandi e ingiuste per affermare il proprio talento o il diritto di vedere apprezzati i loro sacrifici quotidiani. Ma penso anche, con riverenza, a coloro che hanno costruito con le assi del proprio esempio la scala che oggi consente a me di salire e rompere il pesante tetto di cristallo posto sulle nostre teste. Donne che hanno osato, per impeto, per ragione, o per amore. Come Cristina (Trivulzio di Belgioioso), elegante organizzatrice di salotti e barricate. O come Rosalie (Montmasson), testarda al punto da partire con i Mille che fecero l'Italia. Come Alfonsina (Strada) che pedalò forte contro il vento del pr

# Semantic Kernel in Action!
## Import Semantic Kernel SDK from pypi.org

In [None]:
# !python -m pip install -r requirements.txt (semantic-kernel 0.3.1.dev0 on 27/06/2023)

import semantic_kernel as sk

kernel_emb = sk.Kernel()
kernel_txt = sk.Kernel()

# retrieve environment variables for Open AI
azure_openai_deployment_txt, azure_openai_api_key, azure_openai_endpoint = sk.azure_openai_settings_from_dot_env()

from semantic_kernel.connectors.ai.open_ai import AzureTextCompletion

kernel_txt.add_text_completion_service(azure_openai_deployment_txt, 
    AzureTextCompletion(
        azure_openai_deployment_txt, 
        azure_openai_endpoint, 
        azure_openai_api_key))

kernel_txt.set_default_text_completion_service(azure_openai_deployment_txt)

# print(f"deployment:\t{azure_openai_deployment_txt}\napi_key:\t{azure_openai_api_key}\nendpoint:\t{azure_openai_endpoint}")
print(f"deployment:\t{azure_openai_deployment_txt}\napi_key:\t*****\nendpoint:\t{azure_openai_endpoint}")

## Two ways to create a semantic function

In [None]:
# Location of the semantic skills on disk and the skill used in this notebook.
skills_directory    = "./skills"
skills_dataset_name = "mauromi_skills"
skill_name          = "find_and_search_function"

# method 1 - import the skills library
mauromi_skills = kernel_txt.import_semantic_skill_from_directory(skills_directory, skills_dataset_name)
find_and_search_function = mauromi_skills[skill_name]

# method 2 - use the skill text as prompt
# The prompt file is read inside a `with` block so the file handle is closed
# deterministically instead of being left to the garbage collector.
prompt_path = os.path.join(skills_directory, skills_dataset_name, skill_name, "skprompt.txt")
with open(prompt_path) as prompt_file:
    prompt = prompt_file.read()
# This rebinds find_and_search_function, replacing the one obtained by method 1.
find_and_search_function = kernel_txt.create_semantic_function(prompt)

## Let's call *find_and_search_function* semantic function
Be aware: after `run_async` has executed, the following holds:
```
context_variables["input"] == output.result
```
This is the mechanism that makes chaining possible: the output of one function becomes the `input` of the next.

In [None]:
# in this case we use the same question and "best paragraph" identified above

context_variables = sk.ContextVariables()
context_variables["question"] = question

answer = find_and_search_function(input=best_paragraph_text, variables=context_variables)

print(f"Question: {question}\n\nAnswer from paragraph {best_paragraph_nr}:\n{answer.result.lstrip('.').lstrip()}")

## Same test, but this time we use *kernel.run_async* that is useful for chaining
**ContextVariables** is the same collection object used above, to which we need to add the "input" variable

In [None]:
context_variables["input"] = best_paragraph_text

# be aware: after this call, context_variables["input"] holds the same text as output.result
output = await kernel_txt.run_async(find_and_search_function, input_vars=context_variables)
print(f"Question: {question}\n\nAnswer from paragraph {best_paragraph_nr}:\n{output.result.lstrip('.').lstrip()}")

## Chain multiple semantic functions

### Recall: chaining relies on the following assumption:
```
context_variables["input"] == output.result
```


## In this case, we use two skills
This code still does NOT use either of the following:
- PLANNER --> this means that we manually concatenate the functions in the `kernel.run_async()` call
- NATIVE function --> so at the moment we can't concatenate the semantic function with the cosine similarity function. This means that `best_paragraph_text` is the 33rd paragraph (#32)

https://learn.microsoft.com/en-us/semantic-kernel/howto/chainingfunctions

In [None]:
# possible questions:
# "A quali gruppi appartiene l'Italia?"
# "Che cosa hanno fatto le Forze Armate?"
# "Quali sono le donne italiane che hanno dimostrato valore?"
# "Quali sono le grandi sfide dell'Unione Europea?"

# set context variables
question                      = "Che cosa hanno fatto le Forze Armate?"
context_variables             = sk.ContextVariables()
context_variables["style"]    = 'se non trovi la risposta nel testo, rispondi NON LO SO'
context_variables["question"] = question

# identify the answer using embeddings
best_paragraphs_df            = top_paragraphs_identification(question)
best_paragraph_nr             = best_paragraphs_df.index[0]
best_paragraph_text           = best_paragraphs_df["text"][best_paragraph_nr]
context_variables["input"]    = best_paragraph_text

# import functions
find_and_search_function      = mauromi_skills['find_and_search_function']
translate_function            = mauromi_skills['translate_function']

# chained output
await kernel_txt.run_async(find_and_search_function, input_vars=context_variables) # translate_function,
print(f"Question: {question}\n\nAnswer from paragraph {best_paragraph_nr}:\n{context_variables['input'].lstrip('.').lstrip()}")

## Let's add a NATIVE FUNCTION

In our example, we'll create one file with two functions (uppercase and lowercase).
We'll create a single Python file for them, which must be called `native_function.py`.

In [None]:
# possible questions:
# A quali gruppi appartiene l'Italia?
# Che cosa hanno fatto le Forze Armate?
# Quali sono le donne italiane che hanno dimostrato valore?
# Quali sono le grandi sfide dell'Unione Europea?

# set context variables
question                      = "A quali gruppi appartiene l'Italia?"
context_variables             = sk.ContextVariables()
context_variables["style"]    = 'se non trovi la risposta nel testo, rispondi NON LO SO'
context_variables["question"] = question

# identify the answer using embeddings
best_paragraphs_df            = top_paragraphs_identification(question)
best_paragraph_nr             = best_paragraphs_df.index[0]
best_paragraph_text           = best_paragraphs_df["text"][best_paragraph_nr]
context_variables["input"]    = best_paragraph_text

# import semantic functions
find_and_search_function      = mauromi_skills['find_and_search_function']
translate_function            = mauromi_skills['translate_function']

# import native functions
native_skills_dataset_name    = "mauromi_native_skills"
native_skills_file_name       = os.path.join(skills_directory,native_skills_dataset_name,'native_function.py')
mauromi_native_functions      = kernel_txt.import_native_skill_from_directory(
                                parent_directory=skills_directory, skill_directory_name=native_skills_dataset_name)
textcase_function             = mauromi_native_functions['uppercase'] # uppercase or lowercase

# chained output
await kernel_txt.run_async(find_and_search_function, translate_function, textcase_function, input_vars=context_variables)
print(f"Question: {question}\n\nAnswer from paragraph {best_paragraph_nr}:\n{context_variables['input'].lstrip('.').lstrip()}")