In [None]:
"""
Copyright (c) Microsoft Corporation.
Licensed under the MIT license.
"""
import os

from azure.storage.blob import BlobServiceClient

import enrichment.entity_extractor.utils as entity_extractor_utils
from enrichment.entity_extractor.entity_extractor import EntityExtractor, EntityTypeConfig
from enrichment.metadata_parser.json_reader import json_reader
from enrichment.metadata_parser.metadata_parser import MetadataParser


In [None]:
# Preparation: loading the files in blob storage

storage_account_name = 'datastoragestg9ki9'
# key can be grabbed from https://portal.azure.com/#@waldosearch.onmicrosoft.com/resource/subscriptions/26ca8759-8cc8-49b7-9d4e-5ade3345bc0d/resourceGroups/rg-waldo-stg-9ki9-core/providers/Microsoft.Storage/storageAccounts/datastoragestg9ki9/keys
# The key is read from the AZURE_STORAGE_ACCOUNT_KEY environment variable so that the
# secret is never typed into (and accidentally saved or committed with) the notebook.
account_storage_key = os.environ.get('AZURE_STORAGE_ACCOUNT_KEY', '')
if not account_storage_key:
    # Fail here with a clear message instead of building a connection string with an empty key.
    raise ValueError(
        "Storage account key is missing: set the AZURE_STORAGE_ACCOUNT_KEY "
        "environment variable before starting the notebook kernel."
    )
container_name = 'upload'

connection_string = f"DefaultEndpointsProtocol=https;AccountName={storage_account_name};AccountKey={account_storage_key};EndpointSuffix=core.windows.net"
service = BlobServiceClient.from_connection_string(conn_str=connection_string)
container_client = service.get_container_client(container_name)

# Materialize the listing once so it can be filtered (and inspected) afterwards.
all_files = list(container_client.list_blobs(name_starts_with='msr-vtt'))
# Keep only blobs whose name ends in ".json". A plain substring test ('json' in name)
# would also select unrelated blobs that merely contain "json" somewhere in their path.
metadata_files = [x for x in all_files if x['name'].lower().endswith('.json')]

In [None]:
# Let's analyze, for example, the first file
if not metadata_files:
    # Same exception type the bare metadata_files[0] lookup would raise,
    # but with a message that tells the reader what actually went wrong.
    raise IndexError(
        "metadata_files is empty: no JSON metadata blobs were found in the container."
    )
file_name = metadata_files[0]['name']
data = json_reader(container_client, file_name)
parser = MetadataParser()
parsed_data = parser.parse_metadata(data)
# Bare expression: shown as the cell output so the parsed structure can be inspected.
parsed_data

# Now, parsed_data is the parsed JSON file that will be the input to the NER client.

In [None]:
# Let's build and initialize the EntityExtractor object.
ner_extractor = EntityExtractor()
endpoint = 'https://eastus.api.cognitive.microsoft.com/'
# key can be grabbed from: https://portal.azure.com/#@waldosearch.onmicrosoft.com/resource/subscriptions/26ca8759-8cc8-49b7-9d4e-5ade3345bc0d/resourceGroups/rg-waldo-stg-9ki9-ml/providers/Microsoft.CognitiveServices/accounts/textanalytics-stg-9ki9/cskeys
# The key is read from the AZURE_TEXT_ANALYTICS_KEY environment variable so that the
# secret is never typed into (and accidentally saved or committed with) the notebook.
key = os.environ.get('AZURE_TEXT_ANALYTICS_KEY', '')
if not key:
    # Fail here with a clear message instead of initializing the client with an empty key.
    raise ValueError(
        "Text Analytics key is missing: set the AZURE_TEXT_ANALYTICS_KEY "
        "environment variable before starting the notebook kernel."
    )
ner_extractor.initialize_client(endpoint, key)

In [None]:
# Every named-entity type is configured through its own EntityTypeConfig.

# Locations:
#   - 0.9 is the minimum confidence for a named entity to be considered;
#   - the elements in the video_locations field of the xml are added unconditionally
#     to the set of locations;
#   - elements that are substrings of others are removed
#     (e.g., "Zurich" and "Zurich, Switzerland" --> only "Zurich, Switzerland" is left).
locations_ner_config = EntityTypeConfig(
    'Location',
    threshold=0.9,
    add_from_xml=['video_locations'],
    remove_substrings=True,
)

# People:
#   - a slightly lower threshold of 0.85 is used;
#   - elements that are substrings of others are removed
#     (e.g., the family name is dropped if we already have the given and family names).
people_ner_config = EntityTypeConfig(
    'Person',
    threshold=0.85,
    remove_substrings=True,
)

# Organizations: same settings as for people, plus the 'company_names' field,
# which is added unconditionally.
organizations_ner_config = EntityTypeConfig(
    'Organization',
    threshold=0.85,
    add_from_xml=['company_names'],
    remove_substrings=True,
)

# Register the three configurations with the extractor.
ner_extractor.initialize_entity_type_configs(
    [
        locations_ner_config,
        people_ner_config,
        organizations_ner_config,
    ]
)

In [None]:
# Now we can perform the named entity recognition on the parsed metadata.
# Please note that the function can receive more parameters, but in this case we're using their default values.
# NOTE(review): the result is not assigned to a variable; as the last expression of the cell,
# whatever extract_entities returns is only shown as the cell output — confirm this is intended.
ner_extractor.extract_entities(parsed_data)