# Disclaimer
By accessing this code, you acknowledge the code is made available for presentation and demonstration purposes only and that the code: (1) is not subject to SOC 1 and SOC 2 compliance audits; (2) is not designed or intended to be a substitute for the professional advice, diagnosis, treatment, or judgment of a certified financial services professional; (3) is not designed, intended or made available as a medical device; and (4) is not designed or intended to be a substitute for professional medical advice, diagnosis, treatment or judgement. Do not use this code to replace, substitute, or provide professional financial advice or judgment, or to replace, substitute or provide medical advice, diagnosis, treatment or judgement. You are solely responsible for ensuring the regulatory, legal, and/or contractual compliance of any use of the code, including obtaining any authorizations or consents, and any solution you choose to build that incorporates this code in whole or in part.

# Data Processing - Video Recommendation Service
<h3><span style="color: #117d30;">This notebook is a tool that pre-processes videos for a video recommendation service</span></h3>


![](https://#STORAGE_ACCOUNT_NAME#.blob.core.windows.net/images/banner_image.jpg)

## Importing required libraries

In [19]:
import numpy as np 
import pandas as pd

StatementMeta(Media, 67, 19, Submitted, Available)



## Azure Synapse Link

Reading data from Cosmos DB analytical store into a Spark DataFrame via Azure Synapse Link


In [20]:
df = spark.read\
    .format("cosmos.olap")\
    .option("spark.synapse.linkedService", "MediaCosmosDb")\
    .option("spark.cosmos.container", "videoindexerinsights")\
    .load()

indexer = df.toPandas()
insights_ids = indexer['id']
display(df.limit(10))

StatementMeta(Media, 67, 20, Finished, Available)

SynapseWidget(Synapse.DataFrame, 3013902a-830f-4f3c-8141-dfc57be53138)

In [21]:
transcript_df = spark.read\
    .format("cosmos.olap")\
    .option("spark.synapse.linkedService", "MediaCosmosDb")\
    .option("spark.cosmos.container", "transcript")\
    .load()

transcript_pd = transcript_df.toPandas()
video_ids = np.unique(np.array(transcript_pd['videoid']))
transcript_pd.columns

StatementMeta(Media, 67, 21, Finished, Available)

Index(['_rid', '_ts', 'adjustedStart', 'adjustedEnd', 'start', 'end', 'id',
       'text', 'confidence', 'speakerId', 'language', 'videoid', '_etag'],
      dtype='object')

## Data Extraction and Cleaning

Extracting transcripts from each video


In [22]:
extracted_text = []
video_id = []
ks = []
transcripts = []
not_present = []

# Looping over all entries in the dataframe

for video in video_ids: 
    files = transcript_pd.loc[transcript_pd['videoid'] == video]
    files = files.sort_values(by=['id'])
    text = ' '.join(files['text'])
    extracted_text.append(text)

for i in indexer.index: 
    record = indexer.iloc[i]
    vid_id = record['videos'][0]['id']
    video_id.append(vid_id)
    try: 
        index  = video_ids.tolist().index(vid_id)
        transcripts.append(extracted_text[index])
    except:
        not_present.append(vid_id)
        transcripts.append('') 

indexer['video_id'] = video_id
indexer['transcript'] = transcripts

StatementMeta(Media, 67, 22, Finished, Available)



In [23]:
insights_ids['videoid'] = indexer['id']
insights_ids = pd.DataFrame(insights_ids)
ts_id = pd.DataFrame(video_ids,columns=['videoid'])
m = insights_ids.merge(ts_id, on='videoid', how='outer', suffixes=['', '_'], indicator=True)
m[m["_merge"] != 'both']['id']

StatementMeta(Media, 67, 23, Finished, Available)

KeyError: 'videoid'

Adding links for video player and insights


In [None]:
# Url formats for video player and insights for each video
player_url_format = "https://www.videoindexer.ai/embed/player/"
insights_url_format = "https://www.videoindexer.ai/embed/insights/"
end_format = "/?locale=en&location=westus2"
player_url = []
insights_url = []

# Looping over all entires and adding video and insights url to each video

for i in indexer.index: 
    record = indexer.loc[i]
    vid_id = record['videos'][0]['id']
    acc_id = record['accountId']

    record_player_url = player_url_format+acc_id+'/'+vid_id+'/'+end_format
    record_insights_url = insights_url_format+acc_id+'/'+vid_id+'/'+end_format

    player_url.append(record_player_url)
    insights_url.append(record_insights_url)

indexer['insights_url'] = insights_url
indexer['player_url'] = player_url


StatementMeta(, , , Cancelled, )

Extracting topics from videos


In [None]:
vids = indexer['videos']

tp = indexer['videos'][0][0]['insights']['topics']

topics_name = []
topics_iabname = []
for i in range(len(indexer['videos'])): 
    try: 
        record_topics = vids[i][0]['insights']['topics']
        tmp_name = []
        tmp_iab = []
        for j in range(len(record_topics)): 

            tmp_name.append(record_topics[j]['name'])
            tmp_iab.append(record_topics[j]['iabName'])
    except:
        topics_name.append(tmp_name)
        topics_iabname.append(tmp_iab)
        continue    
    topics_name.append(tmp_name)
    topics_iabname.append(tmp_iab)


indexer['topics_name'] = topics_name
indexer['topics_iabname'] = topics_iabname

StatementMeta(, , , Cancelled, )

Preparing data with relevant columns for upload


In [None]:
columns_to_upload = ["video_id","accountId","transcript","name","player_url","insights_url","topics_name","topics_iabname"]
final_df = indexer[indexer.columns & columns_to_upload]
final_df.shape

StatementMeta(, , , Cancelled, )

## Data Upload
Uploading feature engineered JSON files to a container for further use



In [None]:
from azure.storage.blob import ContainerClient
import json

CONNECTION_STRING = "DefaultEndpointsProtocol=https;AccountName=#STORAGE_ACCOUNT_NAME#;AccountKey=#STORAGE_ACCOUNT_KEY#;EndpointSuffix=core.windows.net"
CONTAINER_NAME = "finalmediademostorage"

container_client = ContainerClient.from_connection_string(conn_str=CONNECTION_STRING, container_name=CONTAINER_NAME)

for i in final_df.index: 
    record = final_df.iloc[i]
    js_data = record.to_json()

    fname = record['video_id']+'.json'
    blob_client = container_client.upload_blob(name=fname,data=js_data,overwrite=True)



StatementMeta(, , , Cancelled, )

In [None]:
personas = {
    "items" : [{
    
    "id" : "1",
    "name": "reta",
    "words": ["machine","learning","data","model","experiments","algorithm","classification","learning","analytic","queries","databases","Azure","Cosmos","query","function","SQL","distributed","network","vitual","server"]
    },
    {
        "id": "2",
        "name":"ryan",
        "words":["car","racing","auto","Ferrari","Ford","lap","race","dream","inception","chaos","mission","agency","rogue","combat","mysteries","shoot","planes"]

    }
    
    ]
} 
CONNECTION_STRING = "DefaultEndpointsProtocol=https;AccountName=#STORAGE_ACCOUNT_NAME#;AccountKey=#STORAGE_ACCOUNT_KEY#;EndpointSuffix=core.windows.net"
CONTAINER_NAME = "recommendermodel"

container_client = ContainerClient.from_connection_string(conn_str=CONNECTION_STRING, container_name=CONTAINER_NAME)

blob_client = container_client.upload_blob(name="personas.json",data=json.dumps(personas),overwrite=True)

StatementMeta(, , , Cancelled, )