# Datamap plot of Netflix movie descriptions

* Take the more than 7,000 Netflix movie descriptions
* Use an embedding model to embed these texts into multi-dimensional arrays
* Reduce the dimensionality to 2 (with UMAP) for visualisation
* Use the cool package `datamapplot` to do the visualisation

<a target="_blank" href="https://colab.research.google.com/github/longhowlam/python_hobby_stuff/blob/master/netflix_plot.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
!pip install datamapplot
!pip install -U sentence-transformers
!pip install umap-learn

In [None]:
import datamapplot
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
from sentence_transformers import SentenceTransformer

import umap

## Use an embedding model
This could be, for example, the Universal Sentence Encoder from TensorFlow Hub or `all-mpnet-base-v2` from Hugging Face.

In [None]:
# Option 1: load the Universal Sentence Encoder (version 4) from TensorFlow Hub.
# Note: the following cell rebinds `model` to a SentenceTransformer, so when
# both cells are run the model loaded here is replaced.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)

In [2]:
# Option 2: load a pretrained SentenceTransformer model by name.
# Alternative model, kept for reference:
#model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
model = SentenceTransformer('all-mpnet-base-v2')

  return torch._C._cuda_getDeviceCount() > 0


## Get Netflix data

In [2]:
# Read the Netflix catalogue from a CSV file in the working directory.
netflix = pd.read_csv('netflix.csv')
## Replace missing (NaN) COUNTRY values with the placeholder 'Unknown'
netflix['COUNTRY'] = netflix['COUNTRY'].fillna('Unknown')
# Show five randomly chosen rows as a quick sanity check.
netflix.sample(5)

Unnamed: 0,SHOW_ID,TYPE,TITLE,DIRECTOR,CAST,COUNTRY,DATE_ADDED,RELEASE_YEAR,RATING,DURATION,LISTED_IN,DESCRIPTION
787,s788,Movie,Beat Bugs: All Together Now,Josh Wakely,"Ashleigh Ball, Lili Beaudoin, Charles Demers, ...","Australia, Canada","November 21, 2017",2017,TV-Y,51 min,"Children & Family Movies, Music & Musicals","After winning a local talent show, the Beat Bu..."
3142,s3143,Movie,Jeff Garlin: Our Man In Chicago,Christopher Storer,Jeff Garlin,United States,"November 12, 2019",2019,TV-MA,59 min,Stand-Up Comedy,Comedian Jeff Garlin (unintentionally) celebra...
3546,s3547,Movie,Latte and the Magic Waterstone,"Regina Welker, Nina Wels","Ashley Bornancin, Carter Hastings",Germany,"July 31, 2020",2020,TV-Y,83 min,"Children & Family Movies, Comedies",When a greedy bear steals a magic stone to kee...
1383,s1384,Movie,Chup Chup Ke,"Priyadarshan, Kookie V. Gulati","Shahid Kapoor, Kareena Kapoor, Om Puri, Neha D...",India,"November 1, 2018",2006,TV-PG,158 min,"Comedies, Dramas, International Movies","Mistaking a broke, small-time hustler for a mi..."
7497,s7498,Movie,We Are Legends,Daniel Yee Heng Chan,"Lam Yiu-sing, Ma Chi Wai, Wiyona Yeung, Eric K...",Hong Kong,"June 1, 2019",2019,TV-14,109 min,"Action & Adventure, International Movies, Spor...","Raised in a boxing gym, two orphaned brothers ..."


## Create embeddings

In [4]:
### Embed the DESCRIPTION column
## if TF hub model
## netflix_descr_embeddings = model(netflix['DESCRIPTION'])

# SentenceTransformer.encode() is documented to take a string or a list of
# strings, so hand it a plain Python list instead of a pandas Series (a Series
# is not one of the documented input types).
netflix_descr_embeddings = model.encode(
    netflix['DESCRIPTION'].tolist(),
    show_progress_bar=True,  # embedding can take minutes on a CPU
)

In [6]:
## Save the embeddings to disk in NumPy's .npy format
## It took on my old laptop 9 minutes to compute, better save it :-)
## Or use a GPU......
np.save('netflix_descr_embeddings.npy', netflix_descr_embeddings)

In [8]:
# Display the embedding array as a quick sanity check.
netflix_descr_embeddings

array([[ 7.9707897e-05,  5.2250456e-02, -9.8019997e-03, ...,
        -2.2473235e-02,  5.1695641e-02,  1.7008994e-02],
       [-2.9466137e-02,  6.4157762e-02, -1.9443588e-02, ...,
         1.4587144e-02,  1.4151829e-02,  1.2219788e-02],
       [ 6.7223780e-02, -1.7172762e-03,  2.0036537e-02, ...,
         7.1134279e-03,  1.7426176e-02, -3.9884881e-03],
       ...,
       [ 5.1347680e-02,  2.5746586e-02, -9.5852194e-03, ...,
        -3.6992110e-02, -7.3561328e-03, -3.2956820e-02],
       [ 5.1170550e-02,  3.9280947e-02, -2.3238640e-02, ...,
        -2.5685078e-03,  3.8004894e-02, -1.1110178e-03],
       [ 2.5303083e-02,  6.3870158e-03,  2.1687828e-03, ...,
         2.3527676e-02, -2.4609808e-02, -1.1002290e-02]], dtype=float32)

## Reduce to two dimensions

In [9]:
#### Use UMAP to reduce the embeddings to two dimensions
# UMAP is stochastic: fixing random_state makes the 2D layout (and therefore
# the plot built from it) reproducible from run to run.
# n_components=2 is spelled out because the plot needs exactly two
# coordinates per row.
reducer = umap.UMAP(n_components=2, random_state=42)
netflix_descr_embeddings_2d = reducer.fit_transform(netflix_descr_embeddings)
netflix_descr_embeddings_2d

array([[ 9.129803  , -3.40556   ],
       [10.503146  , -0.4326376 ],
       [10.156288  , -0.48341402],
       ...,
       [ 5.4245305 ,  0.5315926 ],
       [ 6.2143936 ,  2.1056442 ],
       [ 5.0171075 ,  0.54505473]], dtype=float32)

In [10]:
## Save the 2D embeddings to disk in NumPy's .npy format
np.save('netflix_descr_embeddings_2d.npy', netflix_descr_embeddings_2d)

## Create (interactive) visualisations

In [4]:
### Load the saved embeddings
netflix_descr_embeddings = np.load('netflix_descr_embeddings.npy')

### Load the saved 2D embeddings
netflix_descr_embeddings_2d = np.load('netflix_descr_embeddings_2d.npy')

### Interactive

In [11]:
# Build the interactive data map: one point per row of `netflix`, positioned
# by its 2D embedding, labelled with the LISTED_IN column, showing the TITLE
# on hover, and with a histogram of RELEASE_YEAR.
plot = datamapplot.create_interactive_plot(
    netflix_descr_embeddings_2d,
    netflix['LISTED_IN'].values,
    hover_text=netflix['TITLE'].values,
    enable_search=True,
    # Same minimum and maximum, so labels are drawn at a single font size.
    min_fontsize=8,
    max_fontsize=8,
    histogram_data=netflix['RELEASE_YEAR'].values,
    histogram_n_bins=60,
    title="Netflix movies",
    sub_title="A data map of netflix movies using embeddings on movie descriptions",
    histogram_settings={
        "histogram_title": "RELEASE YEAR",
        "histogram_width": 600,
        "histogram_height": 110,
    },
)
# Last expression of the cell: renders the plot inline.
plot

In [9]:
# Write the interactive plot to an HTML file.
plot.save("netflix_plot.html")