<a href="https://colab.research.google.com/github/nikolajvester/7th_semester/blob/main/Group/Assignment_3/NHN_SBERT_Netflix_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
# Pip installs
!pip install sentence-transformers -q
!pip install gradio -q

In [None]:
# Imports
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch
from sentence_transformers.util import cos_sim

# Load the pretrained SBERT checkpoint used for every embedding in this notebook.
# NOTE(review): this checkpoint is reportedly deprecated upstream in favour of newer
# sentence-transformers models — confirm on its model card before relying on result quality.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
# Read the Netflix titles catalogue straight from the course repository
NETFLIX_CSV_URL = 'https://raw.githubusercontent.com/Korsholm22/M4_Group_Assignments/main/Group_Assignment_3/Data/netflix_titles.csv'
df_netflix = pd.read_csv(NETFLIX_CSV_URL)

In [None]:
# Examining the dataset: preview the first rows to sanity-check the columns and their contents
df_netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


# Preprocessing and Feature Engineering

In [None]:
# TODO: remove before the final run — keeps only the first 100 rows so that
# computing corpus_embeddings stays fast while testing
df_netflix = df_netflix.iloc[:100]

In [None]:
# Checking the dataset for NaN values: count of missing entries per column
df_netflix.isna().sum()

show_id          0
type             0
title            0
director        38
cast            11
country         40
date_added       0
release_year     0
rating           0
duration         0
listed_in        0
description      0
dtype: int64

In [None]:
# Keep only the titles with a known cast.
# dropna(...).copy() yields an independent frame, so adding columns to it in later
# cells does not trigger pandas' SettingWithCopyWarning. The original row labels are
# kept (the index is not reset).
df_netflix = df_netflix.dropna(subset=['cast']).copy()

In [None]:
# Merging title, listed_in (genres) and description into one text column so the
# semantic search sees more than the description alone.
# NOTE: type and cast are NOT part of the merged text.
# .assign builds a new frame instead of writing into a filtered slice, which avoids
# pandas' SettingWithCopyWarning and keeps this cell safe to re-run.
df_netflix = df_netflix.assign(
    information=df_netflix['title'].str.cat(
        [df_netflix['listed_in'], df_netflix['description']], sep=". "))

# Checking that the merge is successful. The first remaining row is looked up by
# position, so this works regardless of which row labels were dropped earlier.
df_netflix['information'].iloc[0]

'Blood & Water. International TV Shows, TV Dramas, TV Mysteries. After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.'

In [None]:
# Creating 5 search examples to examine if the model seems to work.
# These strings are fed to the encoder verbatim, so spelling matters for the match quality.
search_examples = [
    "Action movie taking place in space",
    "Sad movie where the dog dies",
    "Documentary about turtles and plastic straws",
    "Funny movie with Kevin Hart and the Rock",
    "True crime tv show where the detective turns out to be the villain",
]

In [None]:
# Embedding the search examples (one vector per query) and checking the resulting shape.
# NOTE(review): `embeddings` is not used again in the visible cells — the search loop
# further down re-encodes each query itself.
embeddings = model.encode(search_examples)

embeddings.shape

(5, 768)

In [None]:
# Pull the merged text column out of the frame as a plain Python list (the search corpus)
show_information = df_netflix['information'].tolist()

In [None]:
# Checking that the conversion is successful by previewing the first five entries
show_information[0:5]

['Blood & Water. International TV Shows, TV Dramas, TV Mysteries. After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.',
 'Ganglands. Crime TV Shows, International TV Shows, TV Action & Adventure. To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war.',
 'Kota Factory. International TV Shows, Romantic TV Shows, TV Comedies. In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life.',
 'Midnight Mass. TV Dramas, TV Horror, TV Mysteries. The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe.',
 "My Little Pony: A New Generation. Children & Family Movies. Equestria's divided. But a bright-eyed hero believes Earth Pon

In [None]:
# Embedding the merged text of every show; convert_to_tensor=True requests the result
# as a tensor so the similarity scores can be ranked with torch.topk further down
corpus_embeddings = model.encode(show_information, convert_to_tensor=True)

In [None]:
# Find the closest sentences of the corpus for each query sentence based on cosine similarity.
# top_k is capped at the corpus size so torch.topk never asks for more entries than exist.
top_k = min(5, len(show_information))
for query in search_examples:
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus entry, then the top_k best scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_scores, top_indices = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    # Report the number of hits actually shown instead of a hard-coded 5
    print("\nTop {} most similar sentences in corpus:".format(top_k))

    # .tolist() converts the tensors to plain Python floats/ints, so the values can be
    # used for list indexing and formatting wherever the tensors are stored
    for score, idx in zip(top_scores.tolist(), top_indices.tolist()):
        print(show_information[idx], "(Score: {:.4f})".format(score))





Query: Action movie taking place in space

Top 5 most similar sentences in corpus:
A StoryBots Space Adventure. Children & Family Movies. Join the StoryBots and the space travelers of the historic Inspiration4 mission as they search for answers to kids' questions about space. (Score: 0.5906)
Into the Night. International TV Shows, TV Dramas, TV Mysteries. Passengers and crew aboard a hijacked overnight flight scramble to outrace the sun as a mysterious cosmic event wreaks havoc on the world below. (Score: 0.5875)
Naruto the Movie 3: Guardians of the Crescent Moon Kingdom. Action & Adventure, Anime Features, International Movies. Exuberant ninja Naruto teams up with his pals Sakura and Kakashi to escort Prince Michiru and his son, Hikaru, to the Crescent Moon kingdom. (Score: 0.4073)
Dark Skies. Horror Movies, Sci-Fi & Fantasy. A family’s idyllic suburban life shatters when an alien force invades their home, and as they struggle to convince others of the deadly threat. (Score: 0.399

In [None]:
def query_corpus(query):
  """Return the corpus entry most similar to ``query`` together with its score.

  Args:
    query: free-text search string.

  Returns:
    Tuple ``(text, score)``: the best-matching entry of ``show_information`` and its
    cosine similarity to the query as a Python float.
  """
  query_embedding = model.encode(query, convert_to_tensor=True)
  cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
  # Only the single best hit is returned, so ask topk for exactly one result; this
  # also removes the hidden dependency on a notebook-level top_k variable.
  best_scores, best_indices = torch.topk(cos_scores, k=1)
  # .item() copies the scalar to the host, so it also works when the embeddings live
  # on a GPU, where Tensor.numpy() raises a TypeError.
  return show_information[best_indices[0].item()], best_scores[0].item()

In [None]:
# Smoke test: fetch the best match (merged text and score) for an example query
query_corpus('Action movie with batman')

('Naruto the Movie 3: Guardians of the Crescent Moon Kingdom. Action & Adventure, Anime Features, International Movies. Exuberant ninja Naruto teams up with his pals Sakura and Kakashi to escort Prince Michiru and his son, Hikaru, to the Crescent Moon kingdom.',
 0.4268164038658142)

In [None]:
# Extract the individual display columns as plain Python lists. They come from the
# same frame as show_information, so position i refers to the same show in each list.
title = df_netflix['title'].tolist()
show_type = df_netflix['type'].tolist()
genre = df_netflix['listed_in'].tolist()
description = df_netflix['description'].tolist()

In [None]:
def query_corpus(query):
  """Return the title of the show most similar to ``query`` together with its score.

  Supersedes the earlier definition of the same name, which returned the full merged
  text instead of just the title.

  Args:
    query: free-text search string.

  Returns:
    Tuple ``(title, score)``: the best-matching entry of ``title`` and its cosine
    similarity to the query as a Python float.
  """
  query_embedding = model.encode(query, convert_to_tensor=True)
  cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
  # Only the single best hit is returned, so ask topk for exactly one result; this
  # also removes the hidden dependency on a notebook-level top_k variable.
  best_scores, best_indices = torch.topk(cos_scores, k=1)
  # .item() copies the scalar to the host, so it also works when the embeddings live
  # on a GPU, where Tensor.numpy() raises a TypeError.
  return title[best_indices[0].item()], best_scores[0].item()

In [None]:
# Same example query, now answered with just the title and its score
query_corpus('Action movie with batman')

('Naruto the Movie 3: Guardians of the Crescent Moon Kingdom',
 0.4268164038658142)

In [None]:
# NOTE(review): this import sits far from the notebook's main import cell — consider moving it there
import gradio as gr

# Serve query_corpus as a small web UI: one text box in; the function's two return
# values are shown as a text field and a label
gr.Interface(fn=query_corpus, inputs=["text"], outputs=["text", "label"]).launch()

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

