# Step 4: Azure Cognitive Search

**Content**

* Create Azure Search Index
* Create Azure Search JSON
* Upload JSON documents on Azure Search

### References: 
* https://docs.microsoft.com/en-us/learn/modules/intro-to-azure-search/2-what-is-azure-search
* https://docs.microsoft.com/en-us/azure/search/cognitive-search-tutorial-blob-python
* https://docs.microsoft.com/en-us/azure/search/search-get-started-python
* https://docs.microsoft.com/pt-br/python/api/overview/azure/search-documents-readme?view=azure-python
* https://docs.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.searchclient?view=azure-python
* https://github.com/Azure-Samples/azure-search-python-samples/blob/master/Quickstart/REST/azure-search-quickstart.ipynb
* https://docs.microsoft.com/en-us/rest/api/searchservice/addupdate-or-delete-documents
* https://docs.microsoft.com/en-us/rest/api/searchservice/create-index
* https://docs.microsoft.com/en-us/azure/search/search-indexer-troubleshooting
* https://docs.microsoft.com/pt-br/azure/search/search-what-is-an-index

In [1]:
! pip install azure-search-documents


Collecting azure-search-documents
  Downloading azure_search_documents-11.3.0-py3-none-any.whl (244 kB)
     -------------------------------------- 244.1/244.1 KB 1.9 MB/s eta 0:00:00
Installing collected packages: azure-search-documents
Successfully installed azure-search-documents-11.3.0


You should consider upgrading via the 'c:\users\blueshift\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.


In [2]:
import json
import requests
from pprint import pprint
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient 
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
     ComplexField,
     CorsOptions,
     SearchIndex,
     ScoringProfile,
     SearchFieldDataType,
     SimpleField,
     SearchableField
 )
import yaml, os
import os.path 
from os import listdir
from collections import Counter


Load Configs

In [3]:
# Load the notebook settings from config/config.yaml.
config_file = os.path.join("config", "config.yaml")
# Binary mode lets PyYAML decode the file itself (UTF-8 by default) instead of
# relying on the platform's locale encoding; safe_load only builds plain
# Python types (dict, list, str, ...), which is all a settings file needs.
with open(config_file, 'rb') as ymlfile:
    config = yaml.safe_load(ymlfile)

##### Azure Cognitive Search settings
service_name = config['search']['service_name']
admin_key = config['search']['admin_key']
index_name = config['search']['index_name']
endpoint = "https://{}.search.windows.net/".format(service_name)
# NOTE(review): this value is concatenated verbatim onto the REST URLs built
# later in the notebook, so it presumably already starts with
# "?api-version=" -- confirm against config.yaml.
api_version = config['search']['api_version']
# Headers sent with every REST call to the search service.
headers = {'Content-Type': 'application/json',
           'api-key': admin_key}

##### Azure Storage settings
container_name = config['azure_storage']['container_name_audios']
# NOTE(review): appended directly after the blob file name when building
# blob_location, so it presumably starts with "?" -- confirm against config.yaml.
az_storage_sas_token = config['azure_storage']['sas_token']
az_storage_name = config['azure_storage']['storage_name']
# Base URI of the container; ends with "/" so a file name can be appended.
az_storage_uri = "https://{name}.dfs.core.windows.net/{container}/".format(name=az_storage_name, container=container_name)


In [4]:
# Document-level client bound to the configured index.
search_client = SearchClient(
    endpoint=endpoint,
    index_name=index_name,
    credential=AzureKeyCredential(admin_key),
)

# Service-level client used to manage indexes. SearchIndexClient is not bound
# to a single index, so no index_name is passed to its constructor; the index
# is named on each operation instead (e.g. delete_index(index_name)).
admin_client = SearchIndexClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(admin_key),
)

    
def delete_search_index(service_name, index_name, admin_key):
    """Delete a search index through the Azure SDK.

    Args:
        service_name: Name of the Azure Cognitive Search service; the service
            endpoint is derived from it.
        index_name: Name of the index to delete.
        admin_key: Admin API key of the search service.

    Returns:
        None. The outcome is printed: a confirmation on success, or the
        exception itself when the SDK call fails (failures are reported
        rather than raised so the notebook keeps running).
    """
    # Derive the endpoint from the argument rather than from a notebook-level
    # variable, so the function honours the service it is asked to act on.
    service_endpoint = "https://{}.search.windows.net/".format(service_name)
    # SearchIndexClient is service-scoped: the index is named per operation.
    client = SearchIndexClient(endpoint=service_endpoint,
                               credential=AzureKeyCredential(admin_key))
    try:
        client.delete_index(index_name)
    except Exception as ex:
        print (ex)
    else:
        print ('Index', index_name, 'Deleted')
        

## Create Azure Search Index

In [5]:
# Field definitions of the search index.
# Attribute flags are real booleans so that the request body is serialised
# with JSON true/false, matching the boolean attributes of the Create Index API.
index_schema = {
    "name": index_name,
    "fields": [
        # Document key: the audio name (file name without extension).
        {"name": "nome_audio", "type": "Edm.String", "key": True, "filterable": True},
        # Stored only for retrieval; not searchable, filterable, sortable or facetable.
        {"name": "blob_location", "type": "Edm.String", "searchable": False, "filterable": False, "sortable": False, "facetable": False},
        # Classification tags: searchable, filterable and facetable.
        {"name": "classificacao", "type": "Collection(Edm.String)", "searchable": True, "filterable": True, "sortable": False, "facetable": True},
        # Extracted entities: searchable only.
        {"name": "protocolo", "type": "Edm.String", "searchable": True, "filterable": False, "sortable": False, "facetable": False},
        {"name": "placa", "type": "Edm.String", "searchable": True, "filterable": False, "sortable": False, "facetable": False},
        {"name": "email", "type": "Edm.String", "searchable": True, "filterable": False, "sortable": False, "facetable": False},
    ],
}

In [6]:
def create_search_index_rest(endpoint, api_version, headers, index_schema):
    """Create a search index through the REST API and print the outcome.

    The request URL is ``endpoint`` + ``"indexes"`` + ``api_version`` joined
    with no separator, and ``index_schema`` is sent as the JSON body. The HTTP
    status code and the raw response text are printed; nothing is returned.
    """
    create_url = "".join((endpoint, "indexes", api_version))
    reply = requests.post(create_url, json=index_schema, headers=headers)
    for detail in (reply.status_code, reply.text):
        print(detail)

In [7]:
# Delete the configured index (the outcome is printed by the function).
delete_search_index(service_name=service_name,
                    index_name=index_name,
                    admin_key=admin_key)

Index audios-rnv Deleted


In [8]:
# Create the index from index_schema; prints the HTTP status and response body.
create_search_index_rest(endpoint=endpoint,
                         api_version=api_version,
                         headers=headers,
                         index_schema=index_schema)

201
{"@odata.context":"https://search-michelesantana.search.windows.net/$metadata#indexes/$entity","@odata.etag":"\"0x8DA98DA47DA6E5D\"","name":"audios-rnv","defaultScoringProfile":null,"fields":[{"name":"nome_audio","type":"Edm.String","searchable":true,"filterable":true,"retrievable":true,"sortable":true,"facetable":true,"key":true,"indexAnalyzer":null,"searchAnalyzer":null,"analyzer":null,"synonymMaps":[]},{"name":"blob_location","type":"Edm.String","searchable":false,"filterable":false,"retrievable":true,"sortable":false,"facetable":false,"key":false,"indexAnalyzer":null,"searchAnalyzer":null,"analyzer":null,"synonymMaps":[]},{"name":"classificacao","type":"Collection(Edm.String)","searchable":true,"filterable":true,"retrievable":true,"sortable":false,"facetable":true,"key":false,"indexAnalyzer":null,"searchAnalyzer":null,"analyzer":null,"synonymMaps":[]},{"name":"protocolo","type":"Edm.String","searchable":true,"filterable":false,"retrievable":true,"sortable":false,"facetable":fal

## Create Azure Search JSON

Create the JSON document used to populate the Azure Search index.

In [9]:
# Mapping from the classes (intents) that LUIS identified for each audio to
# the tags that will be searchable in Azure Search.
classe_tags = {
    "classe_roubo": ["roubo"],
    "classe_furto": ["furto"],
    "classe_colisao": ["colisão"],
    "classe_quebra_retrovisores": ["quebra retrovisores"],
    # Tags used when no specific class was detected. The spelling matters:
    # these values are what users will type into the search box.
    "None": ["nenhum", "não identificado", "indefinido"],
}

In [10]:
def list_files(dir):
    """Return the names of the regular files located directly inside ``dir``.

    Sub-directories are skipped; the names are returned without the directory
    prefix, in the order reported by ``listdir``.
    """
    file_names = []
    for entry in listdir(dir):
        if os.path.isfile(os.path.join(dir, entry)):
            file_names.append(entry)
    return file_names


def read_json_file(file_path):
    """Read a JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialised JSON value (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Read bytes so the json module detects the encoding itself (UTF-8, with
    # or without BOM, UTF-16 or UTF-32) instead of decoding with the
    # platform's locale encoding, which breaks accented text on Windows.
    with open(file_path, 'rb') as json_file:
        return json.load(json_file)


Summarizing the transcription results

In [11]:

# Per-file summary, keyed by transcription file name:
#   {"classes": {intent: count}, "entities": [{"type": ..., "entity": ...}]}
result_json = {}
dir_transcricoes = "transcricoes"
# Only these keys of each entity are carried into the summary.
filtered_keys = ["type", "entity"]
for i in list_files(dir_transcricoes):
    print(i)
    json_data = read_json_file(os.path.join(dir_transcricoes, i))
    # For every result item that has entities, keep its first entity.
    entities = [tok['entities'][0] for tok in json_data['result'] if len(tok['entities']) > 0]
    entities_filtered = [{k: d[k] for k in filtered_keys} for d in entities]
    # Count how many result items have each top-scoring intent.
    intents_summarized = dict(Counter(tok['topScoringIntent']['intent'] for tok in json_data['result']))
    # Keep only intents whose name contains "classe_" or "None".
    result_json[i] = {
        "classes": {k: v for k, v in intents_summarized.items() if 'classe_' in k or 'None' in k},
        "entities": entities_filtered,
    }
    print(result_json[i])



id_1.json
{'classes': {'None': 23, 'classe_furto': 1}, 'entities': [{'type': 'placa', 'entity': 'fdz 5887'}, {'type': 'builtin.email', 'entity': 'maria@outlook.com'}, {'type': 'protocolo', 'entity': '540055484552'}]}
id_2.json
{'classes': {'None': 22, 'classe_colisao': 1}, 'entities': [{'type': 'placa', 'entity': 'bgm 2189'}, {'type': 'builtin.email', 'entity': 'mariana@hotmail.com'}, {'type': 'protocolo', 'entity': '54 30 4579 - 8758'}]}
id_3.json
{'classes': {'None': 22, 'classe_quebra_retrovisores': 1}, 'entities': [{'type': 'placa', 'entity': 'ght 5841'}, {'type': 'builtin.email', 'entity': 'joana@outlook.com'}, {'type': 'protocolo', 'entity': '540056487547'}]}
id_4.json
{'classes': {'None': 23}, 'entities': [{'type': 'placa', 'entity': 'ptv 8753'}, {'type': 'builtin.email', 'entity': 'luana@hotmail.com'}]}
id_5.json
{'classes': {'None': 22, 'classe_roubo': 1}, 'entities': [{'type': 'placa', 'entity': 'adr 2020'}, {'type': 'builtin.email', 'entity': 'julia@outlook.com'}, {'type': '

In [12]:
# Build the JSON document with each audio's data to upload to Azure Search

def generate_json_search(json_data, tags_by_class=None, storage_uri=None, sas_token=None):
    """Build the Azure Search indexing payload from the transcription summaries.

    Args:
        json_data: Mapping of transcription file name (e.g. ``"id_1.json"``) to
            a dict with ``"classes"`` ({class name: count}) and ``"entities"``
            (list of ``{"type": ..., "entity": ...}`` dicts). It is not modified.
        tags_by_class: Mapping of class name to the list of search tags it
            produces. Defaults to the notebook-level ``classe_tags``.
        storage_uri: Base URI (ending in ``/``) of the audio container.
            Defaults to the notebook-level ``az_storage_uri``.
        sas_token: Text appended after the ``.wav`` file name.
            Defaults to the notebook-level ``az_storage_sas_token``.

    Returns:
        ``{"value": [document, ...]}`` with one ``mergeOrUpload`` document per
        entry of ``json_data``. Each entity is stored under its type name
        (``builtin.email`` is renamed to ``email``).

    Raises:
        KeyError: If a class name has no entry in ``tags_by_class``.
    """
    if tags_by_class is None:
        tags_by_class = classe_tags
    if storage_uri is None:
        storage_uri = az_storage_uri
    if sas_token is None:
        sas_token = az_storage_sas_token

    documents = []
    for file_name, summary in json_data.items():
        # Copy the class names so the caller's summary is left untouched.
        classes = list(summary.get("classes") or {})
        # When more than one class was detected, "None" carries no
        # information, so it is dropped (if present at all).
        if len(classes) > 1:
            classes = [c for c in classes if c != "None"]

        tags = []
        for c in classes:
            tags.extend(tags_by_class[c])

        # The audio shares the transcription's base name: strip ".json".
        if file_name.endswith(".json"):
            audio_name = file_name[:-len(".json")]
        else:
            audio_name = file_name

        document = {
            # Must be exactly one of the actions accepted by the service.
            "@search.action": "mergeOrUpload",
            "nome_audio": audio_name,
            "blob_location": storage_uri + audio_name + ".wav" + sas_token,
            "classificacao": tags,
        }

        for found in summary.get("entities") or []:
            value = found["entity"]
            # Protocol numbers are normalised to contain no spaces or dashes.
            if found["type"] == "protocolo":
                value = value.replace(" ", "").replace("-", "")
            document[found["type"].replace("builtin.email", "email")] = value

        documents.append(document)
    return {"value": documents}
    


In [13]:
# Build the upload payload from the per-file transcription summaries.
json_search = generate_json_search(json_data=result_json)

In [14]:
# Display the generated payload for inspection before uploading it.
json_search

{'value': [{'@search.action': 'mergeOrUpload ',
   'nome_audio': 'id_1',
   'blob_location': 'https://storagemichelesantana.dfs.core.windows.net/audios/id_1.wav?sv=2021-06-08&ss=bfqt&srt=sco&sp=rwdlacupiytfx&se=2022-10-10T22:05:30Z&st=2022-09-17T14:05:30Z&spr=https&sig=QeTjk9sgpgv0uEzDu%2By5ByQNsnDg5ZsAWEg65VV3rlM%3D',
   'classificacao': ['furto'],
   'placa': 'fdz 5887',
   'email': 'maria@outlook.com',
   'protocolo': '540055484552'},
  {'@search.action': 'mergeOrUpload ',
   'nome_audio': 'id_2',
   'blob_location': 'https://storagemichelesantana.dfs.core.windows.net/audios/id_2.wav?sv=2021-06-08&ss=bfqt&srt=sco&sp=rwdlacupiytfx&se=2022-10-10T22:05:30Z&st=2022-09-17T14:05:30Z&spr=https&sig=QeTjk9sgpgv0uEzDu%2By5ByQNsnDg5ZsAWEg65VV3rlM%3D',
   'classificacao': ['colisão'],
   'placa': 'bgm 2189',
   'email': 'mariana@hotmail.com',
   'protocolo': '543045798758'},
  {'@search.action': 'mergeOrUpload ',
   'nome_audio': 'id_3',
   'blob_location': 'https://storagemichelesantana.dfs.co

## Upload JSON documents on Azure Search

In [15]:
def upload_documents_rest(endpoint, headers, index_schema, json_search, api_version):
    """Upload the documents in ``json_search`` to the search index via REST.

    Args:
        endpoint: Base URL of the search service, ending with ``/``.
        headers: HTTP headers for the request (content type and API key).
        index_schema: Index definition; its ``"name"`` entry selects the
            target index.
        json_search: Request body, ``{"value": [document, ...]}``.
        api_version: Text appended verbatim to the end of the URL.

    Returns:
        None. The JSON body of the response is pretty-printed.
    """
    # Take the index name from the schema that was passed in; fall back to the
    # notebook-level ``index_name`` for callers that pass a schema without one.
    target_index = index_schema.get("name") if isinstance(index_schema, dict) else None
    if not target_index:
        target_index = index_name
    url = endpoint + "indexes/" + target_index + "/docs/index" + api_version
    response = requests.post(url, headers=headers, json=json_search)
    pprint(response.json())

In [16]:
# Send the generated documents to the index; prints the per-document result.
upload_documents_rest(endpoint=endpoint,
                      headers=headers,
                      index_schema=index_schema,
                      json_search=json_search,
                      api_version=api_version)

{'@odata.context': "https://search-michelesantana.search.windows.net/indexes('audios-rnv')/$metadata#Collection(Microsoft.Azure.Search.V2020_06_30.IndexResult)",
 'value': [{'errorMessage': None,
            'key': 'id_1',
            'status': True,
            'statusCode': 201},
           {'errorMessage': None,
            'key': 'id_2',
            'status': True,
            'statusCode': 201},
           {'errorMessage': None,
            'key': 'id_3',
            'status': True,
            'statusCode': 201},
           {'errorMessage': None,
            'key': 'id_4',
            'status': True,
            'statusCode': 201},
           {'errorMessage': None,
            'key': 'id_5',
            'status': True,
            'statusCode': 201}]}


To test, open the Azure Cognitive Search service in the Azure portal and click on **Search Explorer**.