In [None]:
pip install -U sentence-transformers
pip install fiftyone
pip install torch torchvision umap-learn
pip install "ipywidgets>=8.0, <9"

In [1]:
import cv2
import numpy as np
import pandas as pd
import requests
import pickle
import urllib.request 

import fiftyone.brain as fob
import fiftyone as fo
import fiftyone.zoo as foz
from sentence_transformers import SentenceTransformer, util

## form dataframe with tmdb desc, poster, and script text

In [None]:
import os

In [242]:
df = pd.read_csv('..\\database\\dataset_film_scripts\\springfield_movie_scripts_2023_01_13_clean.csv', index_col=[0])
df = df[df['tmdb_id'] > 0]

dff = pd.read_csv('..\\database\\dataset_tmdb\\df_tmdb_description.csv', index_col=[0])

poster_list = os.listdir('..\\database\\dataset_tmdb\\posters')

df_spaces = pd.read_csv('..\\database\\dataset_spaces_upload.csv', index_col=[0])
df_spaces = df_spaces.drop_duplicates(subset='imdb_id', keep=False)
df_spaces = df_spaces[['imdb_id', 'genre', 'average_rating', 'num_votes']]

In [243]:
# drop all duplicates from df and dff
df = df.drop_duplicates(subset=['tmdb_id'], keep=False)
dff = dff.drop_duplicates(subset=['tmdb_id'], keep=False)

dff = dff[~dff['tmdb_description'].isna()]

In [244]:
# find subset of tmdb_ids that are in posters, desc, and script
tmdb_id_posters = [int(j[5:-4]) for j in poster_list]
tmdb_id_desc = list(dff['tmdb_id'])
tmdb_id_script = list(df['tmdb_id'])

# find tmdb ids that have poster, tmdb description, and film script
tmdb_id_combined = [j for j in tmdb_id_desc if j in tmdb_id_posters]
tmdb_id_combined = [j for j in tmdb_id_combined if j in tmdb_id_script]
len(tmdb_id_combined)

32173

In [245]:
# drop rows that are not in tmdb_id_combined
dff = dff[dff['tmdb_id'].isin(tmdb_id_combined)]
df = df[df['tmdb_id'].isin(tmdb_id_combined)]


In [246]:
poster_list = [poster_list[j] for j in range(len(tmdb_id_posters)) if tmdb_id_posters[j] in tmdb_id_combined]
tmdb_id_posters = [int(j[5:-4]) for j in poster_list]
df_posters = pd.DataFrame(list(zip(tmdb_id_posters, poster_list)), columns=['tmdb_id', 'poster_img'])

In [318]:
df_embedding = df[['imdb_id','tmdb_id','movie_title', 'movie_year', 'script_text']].copy()
df_embedding = df_embedding.join(dff.set_index('tmdb_id'), on='tmdb_id', how='left')
df_embedding = df_embedding.join(df_posters.set_index('tmdb_id'), on='tmdb_id', how='left')
df_embedding = df_embedding.join(df_spaces.set_index('imdb_id'), on='imdb_id', how='left')

df_embedding.to_csv('df_embeddings.csv')

# build embeddings for tmdb description text

In [356]:
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [258]:
# will take ~10 minutes to embed all tmdb descriptions
embeddings_tmdb_desc = [model.encode(jscript) for jscript in df_embedding['tmdb_description']]

# print("Similarity:", util.dot_score(query_embedding, passage_embedding))

# build embeddings for script text

In [357]:
# will take ~2 hours to embed all scripts
embeddings_scripts = [model.encode(jscript) for jscript in df_embedding['script_text']]

In [363]:
genres = [j.split(',') for j in df_embedding['genre']]
genre = []
for j in genres:
    for k in j:
        if(k not in genre):
            genre.append(k)

In [364]:
genre

['Romance',
 'Comedy',
 'Drama',
 'Thriller',
 'Documentary',
 'Biography',
 'Music',
 'Horror',
 'War',
 'Family',
 'Western',
 'Adventure',
 'Crime',
 'Animation',
 'Mystery',
 'Fantasy',
 'Musical',
 'Sci-Fi',
 'Action',
 'Film-Noir',
 'Short',
 'History',
 'Sport',
 'Adult',
 '\\N',
 'Talk-Show',
 'News',
 'NA',
 'Game-Show',
 'Reality-TV']

# build embeddings for tmdb poster data

In [321]:
df_embedding.head()

Unnamed: 0,imdb_id,tmdb_id,movie_title,movie_year,script_text,tmdb_description,poster_img,genre,average_rating,num_votes
0,tt10919164,472886,A 2nd Hand Lover,2015,Wow! Amazing! What a beautiful place. Im meet...,"Ajay Rao is an aspiring rockstar, who comes to...",tmdb_472886.png,Romance,4.7,6.0
1,tt5684466,372399,A Aa,2016,"3 My every birthday, Mom hopes for something....","The love affair, conspired by creation, betwee...",tmdb_372399.png,"Comedy,Drama,Romance",6.8,3310.0
2,tt15331880,938971,A Baby at Any Cost,2022,"0 Jen? Jen? Jen? Oops... sorry! Jason? Oh, sh...",When a surrogate grows too fond of the baby sh...,tmdb_938971.png,Thriller,5.6,147.0
3,tt5212918,438424,A Bad Idea Gone Wrong,2017,"1 - The tropical places, you know, thats the ...",Two would-be thieves forge a surprising relati...,tmdb_438424.png,Comedy,5.8,1246.0
4,tt6359956,431530,A Bad Moms Christmas,2017,"1 My name is Amy Mitchell, and this year Ive ...","Amy, Kiki and Carla – three under-appreciated ...",tmdb_431530.png,Comedy,5.6,54137.0


In [365]:
df_embedding['average_rating'] = df_embedding['average_rating'].fillna(-1)
df_embedding['num_votes'] = df_embedding['num_votes'].fillna(-1)
df_embedding['genre'] = df_embedding['genre'].fillna('NA')

In [366]:
data_path = "C:\\Users\\Nick\\Documents\\DataScience\\movie_recsys\\database\\dataset_tmdb\\posters\\"

In [367]:
dataset = fo.Dataset()
for j in range(len(df_embedding)):
    file_str = data_path + df_embedding.iloc[j]['poster_img']
    sample = fo.Sample(filepath=file_str)
    dataset.add_sample(sample)
    # sample['id'] = df_embedding.iloc[j]['tmdb_id']
    sample['genre'] = df_embedding.iloc[j]['genre'].split(',')
    sample['movie_year'] = df_embedding.iloc[j]['movie_year']
    sample['movie_title'] = df_embedding.iloc[j]['movie_title']
    sample['average_rating'] = df_embedding.iloc[j]['average_rating']
    sample['num_votes'] = df_embedding.iloc[j]['num_votes']
    sample.save()
    if(j%1000 == 0):
        print(j)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000


In [379]:
datatemp = dataset

In [380]:
# Load a resnet from the model zoo
model = foz.load_zoo_model("resnet50-imagenet-torch")

# Verify that the model exposes embeddings
print(model.has_embeddings)
# True

# Compute embeddings for each image
embeddings = datatemp.compute_embeddings(model)
print(embeddings.shape)
# 10000 x 2048




True
  45% |█████|-------| 14437/32173 [1.1h elapsed, 1.5h remaining, 2.7 samples/s]    

In [373]:
32000/4/3600

2.2222222222222223

In [376]:
# Compute 2D representation using pre-computed embeddings
results = fob.compute_visualization(
    datatemp,
    embeddings=embeddings_tmdb_desc[0:1000],
    num_dims=3,
    brain_key="word_embeddings_tmdb",
    verbose=True,
    seed=51,
)

# Compute 2D representation using pre-computed embeddings
results = fob.compute_visualization(
    datatemp,
    embeddings=embeddings_scripts[0:1000],
    num_dims=3,
    brain_key="word_embeddings_scripts",
    verbose=True,
    seed=51,
)


# Compute 2D representation using pre-computed embeddings
results = fob.compute_visualization(
    datatemp,
    embeddings=embeddings[0:1000],
    num_dims=3,
    brain_key="image_embeddings",
    verbose=True,
    seed=51,
)



Generating visualization...
UMAP(n_components=3, random_state=51, verbose=True)
Sat Oct 21 21:41:39 2023 Construct fuzzy simplicial set



n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



Sat Oct 21 21:41:39 2023 Finding Nearest Neighbors
Sat Oct 21 21:41:39 2023 Finished Nearest Neighbor Search
Sat Oct 21 21:41:39 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Sat Oct 21 21:41:41 2023 Finished embedding
Generating visualization...
UMAP(n_components=3, random_state=51, verbose=True)
Sat Oct 21 21:41:41 2023 Construct fuzzy simplicial set



n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



Sat Oct 21 21:41:41 2023 Finding Nearest Neighbors
Sat Oct 21 21:41:41 2023 Finished Nearest Neighbor Search
Sat Oct 21 21:41:41 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Sat Oct 21 21:41:43 2023 Finished embedding
Generating visualization...
UMAP(n_components=3, random_state=51, verbose=True)
Sat Oct 21 21:41:43 2023 Construct fuzzy simplicial set



n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



Sat Oct 21 21:41:43 2023 Finding Nearest Neighbors
Sat Oct 21 21:41:43 2023 Finished Nearest Neighbor Search
Sat Oct 21 21:41:43 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Sat Oct 21 21:41:45 2023 Finished embedding


In [None]:
session = fo.launch_app(datatemp)

plot = results.visualize(
    labels="ground_truth.label",
    labels_title="time of day",
    axis_equal=True,
)


# Attach plot to session
# session.plots.attach(plot)

