# Insights Combiner

#### When searching for an interesting insight, we want to be able to combine all our enrichments into a single document
#### In this notebook we will go over the steps to combine the various insights

In [None]:
"""
Copyright (c) Microsoft Corporation.
Licensed under the MIT license.
"""
import json
import os

import pandas as pd
from azure.storage.blob import BlobServiceClient

from enrichment.metadata_parser.json_reader import json_reader


storage_account_name = 'datastoragestg9ki9'
# key can be grabbed from https://portal.azure.com/#@waldosearch.onmicrosoft.com/resource/subscriptions/26ca8759-8cc8-49b7-9d4e-5ade3345bc0d/resourceGroups/rg-waldo-stg-9ki9-core/providers/Microsoft.Storage/storageAccounts/datastoragestg9ki9/keys
# Export it as AZURE_STORAGE_ACCOUNT_KEY instead of pasting it here, so the secret is
# never saved with the notebook. When the variable is unset this falls back to the
# empty placeholder, which can still be filled in by hand.
account_storage_key = os.environ.get('AZURE_STORAGE_ACCOUNT_KEY', '')
container_name = 'upload'

# Storage settings gathered in a single dict (not read by the cells visible in this notebook).
storage_meta = {
    'account_name': storage_account_name,
    'account_storage_key': account_storage_key,
    'container_name': container_name,
}

# connect to storage container
connection_string = (
    "DefaultEndpointsProtocol=https;"
    f"AccountName={storage_account_name};"
    f"AccountKey={account_storage_key};"
    "EndpointSuffix=core.windows.net"
)
service = BlobServiceClient.from_connection_string(conn_str=connection_string)
container_client = service.get_container_client(container_name)

In [None]:
# List every blob under the 'msr-vtt' prefix, then keep only the JSON files.
all_files = list(container_client.list_blobs(name_starts_with='msr-vtt'))
# Match on the '.json' extension (dot included) so that names which merely end
# with the letters "json" are not picked up by mistake.
json_files = [blob for blob in all_files if blob['name'].endswith('.json')]

#### Read and parse a JSON file

In [None]:
# Read the JSON file
# NOTE(review): `%store -r data` restores a `data` value persisted by an earlier
# `%store data`. The assignment below overwrites it unconditionally, so the restored
# value only survives if `json_reader` raises. Confirm whether that fallback is
# intended; otherwise this line is hidden state and can be dropped.
%store -r data
file_to_read = 'msr-vtt/video1001/video1001.json'
# json_reader receives the container client and the blob path; presumably it returns
# the parsed JSON content - verify against enrichment.metadata_parser.json_reader.
data = json_reader(container_client, file_to_read)


In [None]:
from enrichment.metadata_parser.metadata_parser import MetadataParser
# Build the metadata parser and run it over the JSON content loaded into `data`.
parser = MetadataParser()
parsed_data = parser.parse_metadata(data)

#### Read and parse Video Indexer insights for that file

In [None]:
import json
from enrichment.vi_insights_parser.vi_insights_parser import ViInsightsParser

vi_parser = ViInsightsParser()

# Path of the Video Indexer insights file; it is relative, so it resolves against
# the directory the kernel was started from.
vi_insights_file = 'common/notebooks/demo/insights_combiner/vi_insights.json'

# Two steps: load the raw insights from the file, then parse them.
insights = vi_parser.load_vi_insights(vi_insights_file)
parsed_vi_insights = vi_parser.parse_vi_insights(insights)

#### Extract named entities (NER) for this file

In [None]:
from enrichment.entity_extractor.entity_extractor import EntityExtractor, EntityTypeConfig
import enrichment.entity_extractor.utils as entity_extractor_utils

# Let's build and initialize the EntityExtractor object.
ner_extractor = EntityExtractor()
endpoint = 'https://eastus.api.cognitive.microsoft.com/'
# key can be grabbed from: https://portal.azure.com/#@waldosearch.onmicrosoft.com/resource/subscriptions/26ca8759-8cc8-49b7-9d4e-5ade3345bc0d/resourceGroups/rg-waldo-stg-9ki9-ml/providers/Microsoft.CognitiveServices/accounts/textanalytics-stg-9ki9/cskeys
# Export it as TEXT_ANALYTICS_KEY instead of pasting it here, so the secret is never
# saved with the notebook. When the variable is unset this falls back to the empty
# placeholder, which can still be filled in by hand.
key = os.environ.get('TEXT_ANALYTICS_KEY', '')
ner_extractor.initialize_client(endpoint, key)

In [None]:
# Every named-entity type gets its own EntityTypeConfig.

# Locations: an entity needs a confidence of at least 0.9 to be considered.
# The elements of the xml's video_locations field are added unconditionally,
# and elements that are substrings of others are removed
# (e.g., "Zurich" and "Zurich, Switzerland" --> only "Zurich, Switzerland" is kept).
locations_ner_config = EntityTypeConfig(
    'Location',
    threshold=0.9,
    add_from_xml=['video_locations'],
    remove_substrings=True,
)

# People: a threshold of 0.85, again removing elements that are substrings of others
# (e.g., drop the family name when the given and family names are already present).
people_ner_config = EntityTypeConfig(
    'Person',
    threshold=0.85,
    remove_substrings=True,
)

# Organizations: same settings as for people, plus the 'company_names' field,
# which is added unconditionally.
organizations_ner_config = EntityTypeConfig(
    'Organization',
    threshold=0.85,
    add_from_xml=['company_names'],
    remove_substrings=True,
)

# Register the three configurations with the extractor.
ner_extractor.initialize_entity_type_configs([
    locations_ner_config,
    people_ner_config,
    organizations_ner_config,
])

In [None]:
# Run entity extraction over the parsed metadata using the configs registered above.
ner_insights = ner_extractor.extract_entities(parsed_data)
# Bare last expression so the notebook displays the result.
ner_insights

## Now let's combine all insights into the final document

In [None]:
from enrichment.insights_combiner.insights_combiner import InsightsCombiner
# Merge the three enrichment sources (Video Indexer insights, parsed metadata and
# NER insights) through a single combine_insights call.
combiner = InsightsCombiner()
final_doc = combiner.combine_insights(
    vi_insights=parsed_vi_insights,
    metadata=parsed_data,
    ner_insights=ner_insights,
)

In [None]:
# Bare last expression so the notebook displays the combined document.
final_doc