# Disclaimer
By accessing this code, you acknowledge the code is made available for presentation and demonstration purposes only and that the code: (1) is not subject to SOC 1 and SOC 2 compliance audits; (2) is not designed or intended to be a substitute for the professional advice, diagnosis, treatment, or judgment of a certified financial services professional; (3) is not designed, intended or made available as a medical device; and (4) is not designed or intended to be a substitute for professional medical advice, diagnosis, treatment or judgement. Do not use this code to replace, substitute, or provide professional financial advice or judgment, or to replace, substitute or provide medical advice, diagnosis, treatment or judgement. You are solely responsible for ensuring the regulatory, legal, and/or contractual compliance of any use of the code, including obtaining any authorizations or consents, and any solution you choose to build that incorporates this code in whole or in part.

In [1]:
import webvtt
import os
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import find
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.stem import WordNetLemmatizer 
import pandas as pd

In [2]:
# One-time setup: the NLTK resource downloads below only need to run once per
# environment (later runs just confirm the packages are already present).
lemmatizer = WordNetLemmatizer()
for nltk_package in (
    "stopwords",
    'punkt',
    'wordnet',
    'averaged_perceptron_tagger',
):
    nltk.download(nltk_package)

# The .vtt transcripts are read from a "transcripts" folder under the current
# working directory.
transcripts_location = os.path.join(os.getcwd(), 'transcripts')
transcripts_location

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


'/mnt/batch/tasks/shared/LS_root/mounts/clusters/mediademo/code/Users/demo-mediaindexer/transcripts'

In [3]:
# Define function for tokenizing documents
def tokenize(text):
    """Split *text* into word tokens and reduce each token to its Porter stem.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one stemmed token per token produced by
        ``nltk.word_tokenize``, in the same order.
    """
    # Build the stemmer once per call instead of once per token.
    stem = nltk.PorterStemmer().stem
    return [stem(token) for token in nltk.word_tokenize(text)]

In [4]:
# Read every file found directly inside the transcripts folder (sub-folders
# are not visited: only the first os.walk entry is consumed).
_, _, file_names = next(os.walk(transcripts_location))

transcript_files = {}      # file name -> list of words in that transcript
raw_text = []              # word list per transcript, in file_names order
file_index_mapping = {}    # position in raw_text -> file name
original_transcript = []   # full caption text per transcript, in file_names order

for index, fname in enumerate(file_names):
    transcript = os.path.join(transcripts_location, fname)
    # Every caption is prefixed with one space (so the text starts with a
    # space); joining once avoids rebuilding the string for each caption.
    text = ''.join(' ' + caption.text for caption in webvtt.read(transcript))
    words = text.split()
    transcript_files[fname] = words
    raw_text.append(words)
    original_transcript.append(text)
    file_index_mapping[index] = fname

In [5]:
# Remove stopwords from the corpus
stopwords_english = stopwords.words('english')
# Set copy of the stop-word list so each membership test is a hash lookup
# rather than a scan of the whole list.
stopword_lookup = set(stopwords_english)

for i, words in enumerate(raw_text):
    # Lower-case each word, then drop it if it is found inside
    # string.punctuation (a substring test on whitespace-split words, so
    # punctuation attached to a word is kept) or if it is an English stop word.
    cleaned = [
        word
        for word in (raw_word.lower() for raw_word in words)
        if word not in string.punctuation and word not in stopword_lookup
    ]
    raw_text[i] = cleaned
    # Keep the per-file dictionary pointing at the same cleaned word list.
    transcript_files[file_index_mapping[i]] = cleaned

# Collapse each cleaned word list into one space-separated document string.
raw_text = [' '.join(words) for words in raw_text]

In [6]:
# Vectorize the cleaned transcripts with TF-IDF, using the custom stemming
# tokenizer defined in this notebook.
# token_pattern=None: the default regex pattern is not used when a tokenizer
# is supplied, and leaving it set makes scikit-learn warn about exactly that.
tfidf = TfidfVectorizer(tokenizer=tokenize, token_pattern=None)
video_tfidf = tfidf.fit_transform(raw_text)

In [7]:
# Pairwise cosine similarity between every pair of transcript TF-IDF vectors;
# with a single argument the matrix is compared against itself.
cos_sim = cosine_similarity(video_tfidf)
# Preview: similarity of transcript 1 to every transcript.
cos_sim[1]

array([0.30085629, 1.        , 0.35033709, 0.32551995, 0.30104254,
       0.24232403, 0.23522395, 0.29497551, 0.29244448, 0.25126093,
       0.27271283, 0.29541464, 0.21819794, 0.        , 0.29569872,
       0.22034301, 0.20746902, 0.25595316, 0.29272317, 0.28113713])

In [8]:
# Display the full transcript-by-transcript similarity matrix.
cos_sim

array([[1.        , 0.30085629, 0.40716687, 0.37959107, 0.31846248,
        0.36651929, 0.28148613, 0.33283196, 0.31771419, 0.35003581,
        0.353921  , 0.32560239, 0.23050719, 0.        , 0.34291042,
        0.27034158, 0.23972531, 0.28328704, 0.35955563, 0.35430028],
       [0.30085629, 1.        , 0.35033709, 0.32551995, 0.30104254,
        0.24232403, 0.23522395, 0.29497551, 0.29244448, 0.25126093,
        0.27271283, 0.29541464, 0.21819794, 0.        , 0.29569872,
        0.22034301, 0.20746902, 0.25595316, 0.29272317, 0.28113713],
       [0.40716687, 0.35033709, 1.        , 0.66707715, 0.46774908,
        0.42825236, 0.38415916, 0.51462359, 0.467955  , 0.41069052,
        0.48326853, 0.39749981, 0.39957707, 0.        , 0.4728905 ,
        0.35522039, 0.38886387, 0.32923308, 0.4713265 , 0.60785704],
       [0.37959107, 0.32551995, 0.66707715, 1.        , 0.43404526,
        0.38231549, 0.33982094, 0.46730646, 0.41767856, 0.37570504,
        0.42138429, 0.33941949, 0.34488915, 0

In [9]:
def recomender(index,similarity = cos_sim,topk=3):
    """Return the file names of the ``topk`` transcripts most similar to one.

    Args:
        index: Row of ``similarity`` (and key of ``file_index_mapping``)
            identifying the transcript to recommend for.
        similarity: Square matrix of pairwise similarity scores; defaults to
            the ``cos_sim`` matrix that exists when this function is defined.
        topk: Maximum number of recommendations to return.

    Returns:
        A list of at most ``topk`` file names, most similar first. The queried
        transcript itself is never included.
    """
    scores = similarity[index]
    # Resolve a negative index to its non-negative position so it can be
    # compared with the positions returned by argsort.
    own_position = range(len(scores))[index]
    # Negating the scores makes the ascending argsort rank best-first.
    ranked = np.argsort(-1 * scores)
    # Exclude the query explicitly rather than assuming it is ranked first:
    # another transcript with an equal score could sort ahead of it.
    best = [i for i in ranked if i != own_position][:topk]
    return [file_index_mapping[i] for i in best]


In [10]:
# Position (key of file_index_mapping) of the video to recommend for;
# change this to change the video.
id_to_search = 19
source_video = file_index_mapping[id_to_search]
print("Video that is being used to recommend: ", source_video)

# Look up the most similar videos and display them as the cell output.
recomendations = recomender(id_to_search)
recomendations

Video that is being used to recommend:  What_is_Azure_Synapse_Analytics-_Generally_Available_Today..vtt


['Azure_Synapse_Analytics_-_Introduction_to_Azure_Purview.vtt',
 'Azure_Synapse_Analytics_Now_in_GA_and_the_Public_Preview_of_Azure_Purview.vtt',
 'Shape_Your_Future_with_Azure_Data_and_Analytics.vtt']

In [11]:
import pandas as pd

# Save the similarity matrix to similarity.csv in the working directory,
# with the default integer row and column labels.
similarity_frame = pd.DataFrame(cos_sim)
similarity_frame.to_csv("similarity.csv")