# Disclaimer
By accessing this code, you acknowledge the code is made available for presentation and demonstration purposes only and that the code: (1) is not subject to SOC 1 and SOC 2 compliance audits; (2) is not designed or intended to be a substitute for the professional advice, diagnosis, treatment, or judgment of a certified financial services professional; (3) is not designed, intended or made available as a medical device; and (4) is not designed or intended to be a substitute for professional medical advice, diagnosis, treatment or judgement. Do not use this code to replace, substitute, or provide professional financial advice or judgment, or to replace, substitute or provide medical advice, diagnosis, treatment or judgement. You are solely responsible for ensuring the regulatory, legal, and/or contractual compliance of any use of the code, including obtaining any authorizations or consents, and any solution you choose to build that incorporates this code in whole or in part.

## Building a personalized recommender model

#### This notebook builds a video recommendation model using personalized profiles

### Importing required libraries

In [14]:
import os
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import find
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
import pandas as pd
import json

### Downloading NLTK resources for cleaning text data

In [15]:
# Stop-word lists; read later through stopwords.words('english').
nltk.download("stopwords")
# Tokenizer models that nltk.word_tokenize relies on.
nltk.download('punkt')
# NOTE(review): no cell visible in this notebook references WordNet or the
# POS tagger - confirm these two downloads are still required.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### Defining paths to the persona file and the video transcript folder

In [16]:
# Both inputs are resolved relative to the current working directory.
working_dir = os.getcwd()
persona_path = os.path.join(working_dir, "personas", "personas.json")
transcripts_location = os.path.join(working_dir, 'video_files')

### Defining function to tokenize video transcripts

In [17]:
def tokenize(text):
    """Split ``text`` into word tokens and reduce each token to its Porter stem.

    Args:
        text: String to tokenize.

    Returns:
        list: Stemmed tokens, in the order produced by ``nltk.word_tokenize``.
    """
    # The stemmer is loop-invariant: build it once per call instead of once
    # per token.
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]

### Indexing all files with our video data

In [18]:
# Only files directly inside the transcripts folder are considered.
_, _, file_names = next(os.walk(transcripts_location))
# os.walk yields names in arbitrary order; sorting keeps the video order (and
# with it the column order of everything derived from `ids`) reproducible.
file_names = sorted(file_names)

transcript_files = {}       # file name -> list of transcript words
raw_text = []               # per-video word lists, aligned with `ids`
file_index_mapping = {}     # video_id -> file name
original_transcript = []    # per-video untouched transcript text, aligned with `ids`
ids = []                    # video ids, in processing order

# Going over all files in the chosen directory
for fname in file_names:
    # Ignore stray non-transcript files (e.g. OS metadata) that json.load
    # would choke on.
    if not fname.lower().endswith('.json'):
        continue

    json_file = os.path.join(transcripts_location, fname)

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(json_file, encoding='utf-8') as f:
        json_obj = json.load(f)

    text = json_obj['transcript']
    words = text.split()
    video_id = json_obj['video_id']

    transcript_files[fname] = words
    raw_text.append(words)
    original_transcript.append(text)
    file_index_mapping[video_id] = fname
    ids.append(video_id)

### Filtering stop words from video transcript

In [19]:
stopwords_english = stopwords.words('english')
# Set copy for constant-time membership tests (the NLTK value is a list).
stopword_lookup = set(stopwords_english)

# Removing stop words from video transcripts.
# The word lists are rebuilt from `original_transcript` instead of overwriting
# `raw_text` element by element, so re-running this cell gives the same result
# (previously a second run iterated over the already-joined strings).
cleaned_transcripts = []
for video_id, transcript in zip(ids, original_transcript):
    lowered = (word.lower() for word in transcript.split())
    # `word not in string.punctuation` is a substring test: it only drops
    # tokens that consist purely of punctuation. Punctuation attached to a
    # word (e.g. "model,") is left in place.
    kept_words = [
        word for word in lowered
        if word not in string.punctuation and word not in stopword_lookup
    ]
    transcript_files[file_index_mapping[video_id]] = kept_words
    cleaned_transcripts.append(' '.join(kept_words))

# One cleaned, space-joined string per video, aligned with `ids`.
raw_text = cleaned_transcripts

### Extracting bag of words from persona files

In [20]:
# Read the persona file; every entry under 'items' becomes one persona.
with open(persona_path) as persona_data:
    persona_obj = json.load(persona_data)

personas = list(persona_obj['items'])

# Show the first persona as a sanity check.
personas[0]

{'id': '001',
 'name': 'Reta',
 'words': ['machine',
  'learning',
  'data',
  'model',
  'experiments',
  'algorithm',
  'classification',
  'learning',
  'analytic',
  'queries',
  'databases',
  'Azure',
  'Cosmos',
  'query',
  'function',
  'SQL',
  'distributed',
  'network',
  'vitual',
  'server']}

In [21]:
# One space-separated string of words per persona, in the order of `personas`.
persona_words = [' '.join(persona['words']) for persona in personas]

In [22]:
# Vocabulary and IDF weights are learned from the video transcripts only.
tfidf = TfidfVectorizer(tokenizer=tokenize)
video_tfidf = tfidf.fit_transform(raw_text)
# Personas are projected into that same vocabulary (transform, no refit).
persona_tfidf = tfidf.transform(persona_words)
# Rows follow persona_words, columns follow raw_text (one column per video).
cos_sim = cosine_similarity(persona_tfidf,video_tfidf)

### Defining function for fetching recommendations

In [23]:
def recommender(persona_index,similarity = cos_sim,topk=5):
    """Return the ids of the videos that score highest for one persona.

    Args:
        persona_index: Row of ``similarity`` to rank.
        similarity: 2-D array-like of scores whose columns line up with the
            module-level ``ids`` list. Defaults to the ``cos_sim`` matrix
            that exists when this cell is executed.
        topk: Controls how many ids are returned (see the note in the body).

    Returns:
        list: Video ids ordered from highest to lowest score. At most
        ``topk + 1`` entries.
    """
    # Rank with the matrix that was passed in. Previously the global
    # `cos_sim` was always used and `similarity` was silently ignored.
    scores = np.asarray(similarity)[persona_index]
    # NOTE(review): the slice keeps topk + 1 entries (6 for the default of 5).
    # Kept as-is for backward compatibility - confirm whether exactly topk
    # results were intended.
    best_first = np.argsort(-1 * scores)[:topk + 1]
    return [ids[i] for i in best_first]

### Recommendations for Reta

In [24]:
# Persona index 0 is Reta (see the personas[0] output above).
reta_recommendations_ids = recommender(0)
reta_recommendations_names = []

for video_id in reta_recommendations_ids:
    # Look the file up in the id -> file name index built while loading the
    # transcripts, rather than assuming every file is named "<video_id>.json".
    file_path = os.path.join(transcripts_location, file_index_mapping[video_id])
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, encoding='utf-8') as file_data:
        json_obj = json.load(file_data)
    reta_recommendations_names.append(json_obj['name'])

reta_recommendations_names

['Azure Machine Learning service pipelines in Azure Data Factory.mp4',
 'Performance tuning and troubleshooting - Azure SQL Data Warehouse.mp4',
 'Introduction to Azure Cosmos DB Use Cases.mp4',
 'Why you should modernize to SQL Server 2017.mp4',
 'SQL Server 2017 for developers and machine learning.mp4',
 'Learn about Spark and Azure Cosmos DB integration and use case.mp4']

### Recommendations for Ryan

In [25]:
# Persona index 1 (Ryan, per the section heading).
ryan_recommendations_ids = recommender(1)
ryan_recommendations_names = []

for video_id in ryan_recommendations_ids:
    # Look the file up in the id -> file name index built while loading the
    # transcripts, rather than assuming every file is named "<video_id>.json".
    file_path = os.path.join(transcripts_location, file_index_mapping[video_id])
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, encoding='utf-8') as file_data:
        json_obj = json.load(file_data)
    ryan_recommendations_names.append(json_obj['name'])

ryan_recommendations_names

['Every second counts _ Microsoft In Culture.mp4',
 'Petersen Automotive Museum_ a HoloLens experience.mp4',
 'INCEPTION trailer.mp4',
 'Top Gun - Maverick.mp4',
 'Microsoft Rewards Ultimate Racing Experience.mp4',
 'Toyota Gazoo Racing.mp4']

### Exporting the persona-to-video similarity matrix for later use

In [26]:
# Persist the similarity scores: one CSV row per persona (no index column is
# written), one column per video labelled with its id.
similarity_frame = pd.DataFrame(cos_sim)
similarity_frame.to_csv('personalised_similarity.csv', header=ids, index=False)