# Ingest MSR-VTT Dataset
In this notebook, we convert the metadata extracted from the MSR-VTT dataset into a newml-suitable format. The videos and metadata can be found at the following location:
https://github.com/crux82/msr-vtt-it

#### How to download the video files?
Video files were downloaded manually to a local computer from this [url](https://github.com/crux82/msr-vtt-it/tree/master/msr-vtt-it) and then uploaded to an Azure Blob Storage container.


### Read the JSON file - containing all the annotations for the videos

In [None]:
"""
Copyright (c) Microsoft Corporation.
Licensed under the MIT license.
"""
import json
  
# Opening JSON file. The context manager closes the handle as soon as parsing is done,
# and the explicit encoding keeps non-ASCII captions independent of the platform default.
with open('./common/notebooks/dataset_ingestion/msr_vtt/msr_vtt_raw_data/train_val_videodatainfo.json', encoding='utf-8') as f:
    # returns JSON object as a dictionary
    data = json.load(f)

## Read Category File - converts annotated category id into a string representation

In [None]:
import pandas as pd
# Read the tab-separated category file (one `<name>\t<id>` pair per line); the context
# manager closes the handle once the text has been read.
with open('./common/notebooks/dataset_ingestion/msr_vtt/msr_vtt_raw_data/category.txt') as category_file:
    category = category_file.read()
# Skip blank lines (e.g. the one produced by a trailing newline) so they do not become
# a bogus row whose missing id would be stringified to 'None' below.
category_df = pd.DataFrame([x.split('\t') for x in category.splitlines() if x.strip()])
category_df.columns = ['categ_name','categ_id']
# Ids are kept as text so they can be joined against the stringified video category.
category_df['categ_id'] = category_df['categ_id'].astype(str)
category_df.head()

## Extract video information and annotations

In [None]:
# Pull the two top-level sections out of the annotation dictionary in one step.
videos, metadata = data['videos'], data['sentences']
# Tabular view of the sentence annotations.
df = pd.DataFrame(metadata)

In [None]:
# Build the video table, cast the category id to text so it is comparable with
# `categ_id`, then attach the category columns through a merge.
videos = (
    pd.DataFrame(videos)
    .assign(category=lambda frame: frame['category'].astype('str'))
    .merge(category_df, left_on='category', right_on='categ_id')
)

## Since the annotation file is missing video creation dates, we will fake them with random dates between 2020-01-01 and 2022-07-20

In [None]:
import datetime
import random

# fake creation dates to avoid going to Youtube API
def random_date(start, end):
    """Generate a random datetime between `start` and `end` (both inclusive).

    Args:
        start: Lower bound. Must support `end - start` (yielding an object with
            `total_seconds()`) and `start + timedelta`.
        end: Upper bound, same kind of value as `start`.

    Returns:
        `start` shifted forward by a random whole number of seconds that does not
        exceed the span between the two bounds.

    Raises:
        ValueError: If `end` is earlier than `start`.
    """
    # Imported locally so the function keeps working after a later cell rebinds the
    # module-level name `datetime` (e.g. via `from datetime import datetime`).
    from datetime import timedelta

    span_seconds = int((end - start).total_seconds())
    if span_seconds < 0:
        raise ValueError(f"`end` ({end}) must not be earlier than `start` ({start})")
    # Get a random amount of seconds between `start` and `end`
    return start + timedelta(seconds=random.randint(0, span_seconds))

In [None]:
# merge all annotations for a single video into one long sentence
full_video_description = df.groupby('video_id')['caption'].apply('. '.join).reset_index()
# videos contain multiple languages, so we will use auto to detect languages
full_video_description['video_languages'] = "Various Languages"
full_video_description['video_languages_code'] = 'auto'
# Now let's fake the creation date, each video will have only 1 version so creation date == current version date.
# Seed the generator so re-running the notebook reproduces the same fake dates instead of
# producing different ones on every run.
random.seed(42)
# Parse the bounds once instead of once per row.
fake_date_start = pd.to_datetime('2020-01-01')
fake_date_end = pd.to_datetime('2022-07-20')
full_video_description['first_creation_date'] = full_video_description['video_id'].apply(
    lambda _: random_date(fake_date_start, fake_date_end)
)
full_video_description['current_version_creation_date'] = full_video_description['first_creation_date']
# No restrictions on usage terms
full_video_description['usage_terms'] = 'Microsoft Research - no restrictions'

# The video file name is the video id plus the .mp4 extension
full_video_description['matching_video_name'] = full_video_description['video_id'] + '.mp4'
full_video_description['data_source'] = 'msr-vtt'
# The joined captions double as the video description
full_video_description['video_description'] = full_video_description['caption']
full_video_description['version'] = 1

## Merge all the information together

In [None]:
# Join the per-video descriptions with the video table on the shared `video_id` key.
data = pd.merge(
    full_video_description,
    videos,
    left_on='video_id',
    right_on='video_id',
)

In [None]:
# Restrict the frame to the columns listed below and expose the category name as `topics`.
selected_columns = [
    'first_creation_date',
    'current_version_creation_date',
    'video_id',
    'video_description',
    'video_languages',
    'usage_terms',
    'matching_video_name',
    'video_languages_code',
    'data_source',
    'url',
    'categ_name',
]
data = data[selected_columns].rename(columns={'categ_name': 'topics'})

## Parse Categories into a list of topics (requirement of MetadataParser)

In [None]:
# add the youtube url
# Vectorised concatenation instead of a row-wise `apply(axis=1)`: it avoids a Python-level
# call per row and still yields a column when the frame is empty.
data['video_description'] = 'video shows: ' + data['video_description'] + '. video_url:' + data['url']

# Split the '/'-separated category name into a list of keywords
data['keywords'] = data['topics'].str.split('/')

## Remove all videos whose description mentions children

In [None]:
# Flag every description that mentions one of the child-related terms. The match is a
# case-insensitive substring search, so capitalised captions ("Child", "Boy") are caught
# as well; it also matches inside longer words, which errs on the side of removing more.
mentions_children = data['video_description'].str.contains('child|kid|boy|girl', case=False, na=False)
# `.copy()` makes the result an independent frame so later column assignments do not
# operate on a slice of the unfiltered data.
data = data[~mentions_children].copy()

## Convert DataFrame into a list of dictionaries and save them as JSON files

In [None]:
# Turn both date columns into plain strings.
for date_column in ('first_creation_date', 'current_version_creation_date'):
    data[date_column] = data[date_column].astype(str)

In [None]:
from common.enrichment.xml_parser.metadata_parser import MetadataParser
from azure.storage.blob import BlobServiceClient
from datetime import datetime
from dotenv import load_dotenv
import os

load_dotenv()


def _require_env(name):
    """Return environment variable `name`, raising RuntimeError if it is unset or empty."""
    value = os.getenv(name)
    if not value:
        # Fail here rather than building a connection string that contains the text "None".
        raise RuntimeError(f"Required environment variable '{name}' is not set")
    return value


# setting up variables for connection and source to the source storage account/container
source_account_key = _require_env('SOURCE_ACCOUNT_STORAGE_KEY')
storage_account = _require_env('SOURCE_ACCOUNT_STORAGE_NAME')
container_name = 'msr-vtt'
source_connection_string = f'DefaultEndpointsProtocol=https;AccountName={storage_account};AccountKey={source_account_key};EndpointSuffix=core.windows.net'
source_service = BlobServiceClient.from_connection_string(conn_str=source_connection_string)


# setting up variables for connection to the target storage account/container
target_account_key = _require_env('WALDO_UPLOAD_STORAGE_KEY')
target_storage_account = _require_env('WALDO_STORAGE_ACCOUNT_NAME')
target_container_name = _require_env('WALDO_CONTAINER_NAME')
target_connection_string = f'DefaultEndpointsProtocol=https;AccountName={target_storage_account};AccountKey={target_account_key};EndpointSuffix=core.windows.net'
target_service = BlobServiceClient.from_connection_string(conn_str=target_connection_string)

# Container clients: the source container, the upload target container, and the
# 'bronze' container that lives on the target account.
source_container_client = source_service.get_container_client(container_name)
target_container_client = target_service.get_container_client(target_container_name)
bronze_container_client = target_service.get_container_client('bronze')


In [None]:


records = data.to_dict('records')
source = 'msr-vtt'
# (blob name, error) pairs for every upload/copy that failed; summarised after the loop
# so a partially failed ingestion is not lost in the per-record output.
failed_operations = []
for record in records:

    # parse metadata file
    parser = MetadataParser()
    parsed = parser.parse_metadata(record)

    # upload metadata file to directory in upload
    new_blob_name = f"{source}/{parsed['file_name']}/{parsed['file_name']}.json"
    print(new_blob_name)
    blob_client = target_container_client.get_blob_client(new_blob_name)
    try:
        blob_client.upload_blob(json.dumps(parsed), overwrite=True)
    except Exception as e:
        # Best effort: record the failure, skip this record's video and keep going.
        print(f"Metadata upload failed for {new_blob_name}: {e}")
        failed_operations.append((new_blob_name, e))
        continue

    # Copy the video file from the bronze container to its new location
    # (this starts a copy; the blob under MP4/ is not deleted here).
    video_blob = bronze_container_client.get_blob_client(f"MP4/{parsed['matching_video_name']}")
    print(video_blob.url)
    target_blob_name = f"{source}/{parsed['file_name']}/{parsed['matching_video_name']}"
    target_video_blob = target_container_client.get_blob_client(target_blob_name)
    try:
        target_video_blob.start_copy_from_url(video_blob.url)
    except Exception as e:
        print(f"Video copy failed for {target_blob_name}: {e}")
        failed_operations.append((target_blob_name, e))

print(f"Processed {len(records)} record(s); {len(failed_operations)} operation(s) failed")
