# Project Week 1: ActivityNet Video Data Preparation and Indexing

In this example we will use the ActivityNet dataset https://github.com/activitynet/ActivityNet. 

 - Select the 10 videos with more moments.
 - Download these videos onto your computer.
 - Extract the frames for every video.
 - Read the textual descriptions of each video.
 - Index the video data in OpenSearch.

 In this week, you will index the video data and make it searchable with OpenSearch. You should refer to the OpenSearch tutorial laboratory.

## Imports

In [20]:
import json
import pprint as pp
from pprint import pprint

#import av
#import av.datasets

#Open Search
import requests

from opensearchpy import OpenSearch
from opensearchpy import helpers

## Select videos and Captions
Download the `activity_net.v1-3.min.json` file containing the list of videos. The file is in the github repository of ActivityNet.
Parse this file and select the 10 videos with more moments.

## Tratamento das Captions

In [21]:
def load_captions_data(file_path):
    """Carrega um ficheiro de captions e devolve um dicionário processado"""
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    processed = {}
    for video_id, captions in data.items():
        processed[video_id] = {
            "segments": captions['segments'] if 'segments' in captions else captions,
        }
    return processed

# Load the data
val_data1 = load_captions_data('captions/val_1.json')
val_data2 = load_captions_data('captions/val_2.json')

# Combine dictionaries (preserving video_id as keys)
all_captions_data = {**val_data1, **val_data2}

pprint(f"Number of captions: {len(all_captions_data)}")
pprint(f"Example Captions: {all_captions_data}")

'Number of captions: 4917'
("Example Captions: {'v_uqiMw7tQ1Cc': {'segments': {'duration': 55.15, "
 "'timestamps': [[0, 4.14], [4.14, 33.36], [33.36, 55.15]], 'sentences': ['Two "
 'men both dressed in athletic gear are standing and talking in an indoor '
 "weight lifting gym filled with other equipment.', ' One man is holding onto "
 'a rope attached to a machine, and the other man instructs him to bend down '
 'on his left knee while still holding onto the rope and he showing the man '
 'how to have proper form.\', " The man then instructs the man holding the '
 'rope to pull the row down a few times and he\'s talking the whole time."]}}, '
 "'v_bXdq2zI1Ms0': {'segments': {'duration': 73.1, 'timestamps': [[6.94, "
 "69.08], [37.28, 43.49], [43.13, 55.55]], 'sentences': ['Three men are "
 "standing on a mat.', ' The man in front begins to do karate on the mat.', ' "
 "He gets down on the ground and flips around.']}}, 'v_FsS_NCZEfaI': "
 "{'segments': {'duration': 212.74, 'timestamps'

## Tratamento dos vídeos

In [22]:
with open('activity_net.v1-3.min.json', 'r') as json_data:
    data = json.load(json_data)

database = {}

for video_id in data['database']:
    database["v_" + video_id] = data['database'][video_id]

# Criar lista ordenada com todos os dados completos
sorted_database = sorted(
    database.items(),
    key=lambda x: len(x[1]['annotations']),
    reverse=True
)

# Top 10 vídeos (completo)
top_videos = dict(sorted_database[:27])

pprint(top_videos)

{'v_-ap649M020k': {'annotations': [{'label': 'Longboarding',
                                    'segment': [9.965381472401754,
                                                10.961919619641929]},
                                   {'label': 'Longboarding',
                                    'segment': [15.280251591016023,
                                                32.88575885892579]},
                                   {'label': 'Longboarding',
                                    'segment': [37.86844959512666,
                                                44.84421662580789]},
                                   {'label': 'Longboarding',
                                    'segment': [54.80959809820965,
                                                60.456647599237314]},
                                   {'label': 'Longboarding',
                                    'segment': [72.7472847485328,
                                                74.07600227818637]},
             

In [23]:
# Verifique quantos IDs do top_10 existem nas captions
matching_ids = set(database.keys()) & set(all_captions_data.keys())
print(f"Número de IDs correspondentes: {len(matching_ids)}")
print(f"IDs no top_videos: {list(top_videos.keys())[:5]}...")
print(f"IDs em all_captions_data: {list(all_captions_data.keys())[:5]}...")

Número de IDs correspondentes: 4917
IDs no top_videos: ['v_o1WPnnvs00I', 'v_oGwn4NUeoy8', 'v_VEDRmPt_-Ms', 'v_qF3EbR8y8go', 'v_DLJqhYP-C0k']...
IDs em all_captions_data: ['v_uqiMw7tQ1Cc', 'v_bXdq2zI1Ms0', 'v_FsS_NCZEfaI', 'v_K6Tm5xHkJ5c', 'v_4Lu8ECLHvK4']...


In [24]:
final_dataset_video = {}
final_dataset_captions = {}

for video_id in top_videos:
    try:
        if (all_captions_data[video_id] != None):
            final_dataset_video[video_id] = top_videos[video_id]
            final_dataset_captions[video_id] = all_captions_data[video_id]
    except Exception as e:
        None

final_dataset_video.pop("v_PJ72Yl0B1rY", None)
final_dataset_captions.pop("v_PJ72Yl0B1rY", None)

pprint(final_dataset_captions)
pprint(final_dataset_video)
print(len(final_dataset_video))
print(len(final_dataset_captions))

{'v_2ji02dSx1nM': {'segments': {'duration': 162.69,
                                'sentences': ['A surfer is riding on a surf '
                                              'board in the ocean.',
                                              ' He goes through the waves as '
                                              'they crash around him.',
                                              ' He continues riding the waves '
                                              'and talking to the camera in an '
                                              'interview.'],
                                'timestamps': [[0, 9.76],
                                               [18.71, 68.33],
                                               [82.97, 162.69]]}},
 'v_6gyD-Mte2ZM': {'segments': {'duration': 188.25,
                                'sentences': ["There's a man in a brown shirt "
                                              'bowling in a large alley in a '
                             

## Video frame extraction

PyAV is a wrapper library providing you access to `ffmpeg`, a command-line video processing tool. In the example below, you will be able to extract frames from the a video shot.

In [9]:
content = av.datasets.curated("pexels/time-lapse-video-of-night-sky-857195.mp4")
with av.open(content) as container:
    # Signal that we only want to look at keyframes.
    stream = container.streams.video[0]
    stream.codec_context.skip_frame = "NONKEY"

    for i, frame in enumerate(container.decode(stream)):
        print(frame)
        frame.to_image().save(f"night-sky.{i:04d}.jpg", quality=80)

NameError: name 'av' is not defined

## Video metadata

Process the video metadata provided in the `json` file and index the video data in OpenSearch.

In [25]:
#OpenSearch

host = 'api.novasearch.org'
port = 443

user = 'user09' # Add your user name here.
password = 'grupo09fct' # Add your user password here. For testing only. Don't store credentials in code. 
index_name = user

In [26]:
#Just to test if OpenSearch is up and running

# Create the client with SSL/TLS enabled, but hostname verification disabled.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = (user, password),
    use_ssl = True,
    url_prefix = 'opensearch_v2',
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

if client.indices.exists(index_name):

    resp = client.indices.open(index = index_name)
    print(resp)

    print('\n----------------------------------------------------------------------------------- INDEX SETTINGS')
    settings = client.indices.get_settings(index = index_name)
    pp.pprint(settings)

    print('\n----------------------------------------------------------------------------------- INDEX MAPPINGS')
    mappings = client.indices.get_mapping(index = index_name)
    pp.pprint(mappings)

    print('\n----------------------------------------------------------------------------------- INDEX #DOCs')
    print(client.count(index = index_name))
else:
    print("Index does not exist.")

{'acknowledged': True, 'shards_acknowledged': True}

----------------------------------------------------------------------------------- INDEX SETTINGS
{'user09': {'settings': {'index': {'creation_date': '1743960679413',
                                   'knn': 'true',
                                   'number_of_replicas': '0',
                                   'number_of_shards': '4',
                                   'provided_name': 'user09',
                                   'refresh_interval': '-1',
                                   'replication': {'type': 'DOCUMENT'},
                                   'uuid': 'DM4PfNq4TT24e0JVDesqeA',
                                   'version': {'created': '136387927'}}}}}

----------------------------------------------------------------------------------- INDEX MAPPINGS
{'user09': {'mappings': {'dynamic': 'strict',
                         'properties': {'annotations': {'type': 'flat_object'},
                                        'd

In [27]:
#index_name = 'user09_videos'

index_body = {
   "settings":{
      "index":{
         "number_of_replicas":0,
         "number_of_shards":4,
         "refresh_interval":"-1",
         "knn":"true"
      }
   },
   "mappings":{
       "dynamic":"strict",
       "properties":{
         "video_id":{
            "type":"keyword"
         },
         "annotations":{
            "type":"flat_object"
         },
         "duration":{
            "type":"float"
         },
         "url":{
            "type":"keyword"
         }
      }
   }
}

if client.indices.exists(index=index_name):
    print("Index already existed. Nothing to be done.")
else:        
    response = client.indices.create(index_name, body=index_body)
    print('\nCreating index:')
    print(response)

Index already existed. Nothing to be done.


In [28]:
client.indices.delete(index=index_name, ignore=[400, 404])
client.indices.create(index=index_name, body=index_body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'user09'}

In [29]:
#Index the dataset of the videos

for video_id, data in final_dataset_video.items():
    filtered_video = {
        "video_id": video_id,
        "annotations": data["annotations"],
        "duration": data["duration"],
        "url": data["url"]
    }
    
    resp = client.index(index=index_name, id=video_id, body=filtered_video)
    print(resp['result'])

created
created
created
created
created
created
created
created
created
created


## Video captions

The ActivityNetCaptions dataset https://cs.stanford.edu/people/ranjaykrishna/densevid/ dataset provides a textual description of each videos. Index the video captions on a text field of your OpenSearch index.

In [15]:
index_name = 'user09_captions'

index_body = {
    "settings": {
        "index": {
            "number_of_replicas": 0,
            "number_of_shards": 4,
            "refresh_interval": "-1",
            "knn": "true"
        },
    },
    "mappings": {
        "dynamic": "strict",
        "properties": {
            "video_id": {
                "type": "keyword"
            },
            "duration": {
                "type": "float"
            },
            "sentences": {
                "type": "flat_object"
            },
            "timestamps": {
                "type": "flat_object"
            }
        }
    }
}

# Verificar se o índice já existe
if client.indices.exists(index=index_name):
    print("O índice de captions já existe. Nada a fazer.")
else:        
    response = client.indices.create(index_name, body=index_body)
    print('\nCriando índice de captions:')
    print(response)

AuthorizationException: AuthorizationException(403, '')

In [31]:
for video_id, data in final_dataset_captions.items():
    filtered_caption = {
        "video_id": video_id,
        "duration": data["segments"]["duration"],
        "sentences": data["segments"]["sentences"],
        "timestamps": data["segments"]["timestamps"]
    }
    
    resp = client.index(index=index_name, id=video_id, body=filtered_caption)
    print(resp['result'])

AuthorizationException: AuthorizationException(403, 'security_exception', 'no permissions for [indices:data/write/index] and User [name=user09, backend_roles=[own_index], requestedTenant=null]')

## Search Functionality

## Text-based Search 

In [None]:
qtxt = "How many videos is about Longboarding?"

query_bm25 = {
    "size": 5,
    "_source": ["video_id", "url"],  
    "query": {
      "match": {
        "annotations.label": "Skateboarding"
    }
}

response = client.search(
    body=query_bm25,
    index=index_name  
)

print("\nSearch results:")
pp.pprint(response)



Search results:
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 2}
