In [None]:
!pip install numpy scikit-learn seaborn

In [None]:
# Import the movie embeddings
import json
# Read the embeddings inside a context manager so the file handle is closed
# as soon as parsing finishes (a bare open() would leave it to the garbage collector).
with open("honey_i_shrunk_the_kids_movie_embeddings_1_second.json", encoding="utf-8") as embeddings_file:
    movie_embeddings = json.load(embeddings_file)

In [None]:
# Report how many embedding records were loaded
total_records = len(movie_embeddings)
print(total_records)

In [None]:
# Trim the credits with a single slice: skip the first 163 records (opening
# credits), then keep at most the next 5222 records (whatever follows is
# treated as closing credits).
# NOTE: this rebinds movie_embeddings, so running the cell again trims again.
opening_credit_records = 163
records_kept = 5222
movie_embeddings = movie_embeddings[opening_credit_records:opening_credit_records + records_kept]

In [None]:
# Folder holding the frame thumbnails; a record's "input" value is appended to it to build an image path
from IPython.display import Image, display
image_root = 'thumbnails_folder2large/'

# NumPy supplies the array math used by the distance helper below
import numpy as np
def euclidean_distance(array1, array2):
    """Return the norm of the element-wise difference between two inputs.

    Both arguments are converted to NumPy arrays first, so plain Python
    lists of numbers are accepted. For 1-D inputs the result is the
    Euclidean (straight-line) distance between the two vectors.
    """
    difference = np.array(array1) - np.array(array2)
    return np.linalg.norm(difference)

# Pick one reference frame and measure how far every frame is from it
target_index = 4000
target = movie_embeddings[target_index]
index_to_distance = [
    euclidean_distance(frame["embedding"], target["embedding"])
    for frame in movie_embeddings
]

# Positions of the frames ordered from nearest to farthest from the reference
sorted_indexes = np.argsort(index_to_distance)

In [None]:
# Using t-SNE to embed the vectors into 2D
from sklearn.manifold import TSNE
# Collect every record's embedding into one array
# (one row per record, assuming all embeddings have the same length)
embeddings = np.array([record['embedding'] for record in movie_embeddings])

# Fit t-SNE with a fixed seed and keep the 2-D coordinates it produces
tsne = TSNE(
    n_components=2,
    random_state=42,
)
embedded_vectors = tsne.fit_transform(embeddings)

In [None]:
# Performing KMeans clustering with k=12
from sklearn.cluster import KMeans
# Cluster the 2-D t-SNE coordinates (not the raw embeddings); the seed is fixed
kmeans = KMeans(
    n_clusters=12,
    random_state=42,
)
# One cluster id per row of embedded_vectors
clusters = kmeans.fit_predict(embedded_vectors)

# Extracting numbers from file names for labels
import re
# The label is the first run of digits found in each record's "input" value
first_number = re.compile(r'\d+')
labels = [first_number.search(record['input']).group() for record in movie_embeddings]

# Plotting the embedded vectors with cluster coloring
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
plt.figure(figsize=(12, 8))  # Adjust the figure size as needed
sns.scatterplot(
    x=embedded_vectors[:, 0],
    y=embedded_vectors[:, 1],
    hue=clusters,
    palette='bright',
    legend='full',
    s=100,
)
# Write each point's label slightly up and to the right of the point
for (x_pos, y_pos), point_label in zip(embedded_vectors, labels):
    plt.text(x_pos + 0.02, y_pos + 0.02, point_label, fontsize=6)
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.title('t-SNE Embedded Vectors with KMeans Clustering (k=12)')
plt.legend(title='Cluster')
plt.show()

# Take the cluster count from the fitted model so it cannot drift out of sync
# with the n_clusters value that was passed to KMeans.
n_clusters = kmeans.n_clusters

# One list per cluster, holding the positions (in movie_embeddings order) of its members
cluster_indexes = [[] for _ in range(n_clusters)]

# Populate lists with indexes
for i, cluster in enumerate(clusters):
    cluster_indexes[cluster].append(i)

# Report how many frames landed in each cluster
for i in range(n_clusters):
    print("cluster", i, len(cluster_indexes[i]))

# Text label naming the cluster of every frame, in movie_embeddings order
cluster_labels = [f'Cluster {cluster}' for cluster in clusters]

# Frame position on the x axis, cluster name on the y axis:
# shows where in the (trimmed) sequence each cluster occurs
frame_positions = np.arange(len(movie_embeddings))
sns.set()
plt.figure(figsize=(12, 8))  # Adjust the figure size as needed
sns.scatterplot(
    x=frame_positions,
    y=cluster_labels,
    hue=cluster_labels,
    palette='bright',
    legend='full',
    s=100,
)
plt.xlabel('Index')
plt.ylabel('Cluster')
plt.title('Cluster Assignments of Vectors')
plt.legend(title='Cluster')
plt.show()

In [None]:
# Show the first nine frames of cluster 0 to get a feel for what it contains
print("Displaying Example Images from Cluster for Familiarization.")
preview_positions = cluster_indexes[0][0:9]
for i in preview_positions:
    image_path = image_root + movie_embeddings[i]["input"]
    display(Image(filename=image_path))

In [None]:
# For a frame centered on each child, measure the distance from that frame to
# every frame, plot it over the film, and compare the average distances.
import statistics


def distances_from_frame(target_index):
    """Return the distance from the frame at target_index to every frame.

    The result is a list in movie_embeddings order: position i holds the
    distance between frame i and the target frame.
    """
    target = movie_embeddings[target_index]
    return [
        euclidean_distance(emb["embedding"], target["embedding"])
        for emb in movie_embeddings
    ]


def show_distance_profile(target_index):
    """Display the target frame, plot its distance to every frame, and return the distances."""
    # Show the chosen frame first so the choice can be checked by eye
    image_path = image_root+movie_embeddings[target_index]["input"]
    display(Image(filename=image_path))

    distances = distances_from_frame(target_index)

    # Create a plot using Seaborn
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    sns.lineplot(x=range(len(distances)), y=distances)
    plt.xlabel("Index")
    plt.ylabel("Distance")
    plt.title("Distance from Target Over Film")
    plt.show()
    return distances


# Target frames chosen for each character
index_to_Amy = show_distance_profile(1043)   # daughter
index_to_Russ = show_distance_profile(1059)  # neighbor's older son
index_to_Ron = show_distance_profile(1062)   # neighbor's younger son
index_to_Nick = show_distance_profile(627)   # inventor's son

# Finding lowest average distance
average_to_Amy = statistics.mean(index_to_Amy)
average_to_Russ = statistics.mean(index_to_Russ)
average_to_Ron = statistics.mean(index_to_Ron)
average_to_Nick = statistics.mean(index_to_Nick)
print(f"Average to Amy: {average_to_Amy}")
print(f"Average to Russ: {average_to_Russ}")
print(f"Average to Ron: {average_to_Ron}")
print(f"Average to Nick: {average_to_Nick}")

In [None]:
# Analyzing how often the dog appears

# Show the chosen reference frame so the choice can be checked by eye
print("Target Frame Printed Below")
target_index = 1469
target = movie_embeddings[target_index]
image_path = image_root+target["input"]
display(Image(filename=image_path))

# Distance from the reference frame to every frame, in movie_embeddings order
index_to_distance = [
    euclidean_distance(frame["embedding"], target["embedding"])
    for frame in movie_embeddings
]

# Positions of the frames ordered from nearest to farthest from the reference
sorted_indexes = np.argsort(index_to_distance)

# Cluster count used for the finer-grained clustering in this cell
number_of_clusters = 36

# Project the embeddings to 2-D with t-SNE.
# NOTE(review): this uses the same input list, n_components and random_state as
# the earlier t-SNE fit in this notebook, so it looks like a repeat of that
# computation — confirm whether the existing embedded_vectors could be reused.
embeddings = np.array([record['embedding'] for record in movie_embeddings])
tsne = TSNE(
    n_components=2,
    random_state=42,
)
embedded_vectors = tsne.fit_transform(embeddings)

# Performing KMeans clustering with k=36
from sklearn.cluster import KMeans
# Cluster the 2-D t-SNE coordinates into number_of_clusters groups; the seed is fixed.
# NOTE(review): n_init is left at the library default, which is understood to
# differ between scikit-learn releases — consider pinning it, because later
# code refers to clusters by their numeric id.
kmeans = KMeans(
    n_clusters=number_of_clusters,
    random_state=42,
)
clusters = kmeans.fit_predict(embedded_vectors)

# The label is the first run of digits found in each record's "input" value
labels = [re.search(r'\d+', record['input']).group() for record in movie_embeddings]

# One list per cluster, holding the positions (in movie_embeddings order) of its members
cluster_indexes = [[] for _ in range(number_of_clusters)]
for position, cluster_id in enumerate(clusters):
    cluster_indexes[cluster_id].append(position)

# Report how many frames landed in each cluster
for i in range(number_of_clusters):
    print(f"Cluster {i} contains {len(cluster_indexes[i])} images.")

# Text label naming the cluster of every frame, in movie_embeddings order
cluster_labels = [f'Cluster {cluster}' for cluster in clusters]

# Id of the cluster identified as the dog cluster; named once so the preview
# and the summary message below cannot disagree.
# NOTE(review): the id is tied to the clustering run it was read from —
# re-check it whenever the clustering changes.
dog_cluster = 15

# Printing a portion of dog cluster
print("Example Images from Dog Cluster Below")
for i in cluster_indexes[dog_cluster][0:10]:
    image_path = image_root+movie_embeddings[i]["input"]
    display(Image(filename=image_path))

# Showing length of dog cluster (one record per cluster member)
print(f"Cluster {dog_cluster} is the dog cluster, and he appears on screen for about {len(cluster_indexes[dog_cluster])} seconds.")

# Frame position vs. cluster name: shows when each cluster (including the
# dog's) occurs over the course of the film
frame_positions = np.arange(len(movie_embeddings))
sns.set()
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=frame_positions,
    y=cluster_labels,
    palette='bright',
    hue=cluster_labels,
    legend='full',
    s=100,
)
plt.xlabel('Index')
plt.ylabel('Cluster')
plt.title('Cluster Assignments of Vectors')
plt.legend(title='Cluster')
plt.show()

# 2-D t-SNE view coloured by the k=36 clusters, for comparison with the
# earlier k=12 plot of the same kind
sns.set()
plt.figure(figsize=(12, 8))  # Adjust the figure size as needed
sns.scatterplot(
    x=embedded_vectors[:, 0],
    y=embedded_vectors[:, 1],
    palette='bright',
    hue=clusters,
    legend='full',
    s=100,
)
# Write each point's label slightly up and to the right of the point
for (x_pos, y_pos), point_label in zip(embedded_vectors, labels):
    plt.text(x_pos + 0.02, y_pos + 0.02, point_label, fontsize=6)
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.title('t-SNE Embedded Vectors with KMeans Clustering (k=36)')
plt.legend(title='Cluster')
plt.show()

In [None]:
# Finding yard scenes
print("Ten Example Images Are Shown Below From The Last Yard Cluster.")
# Slice [0:10] so that ten images are shown, matching the message above
# (the previous [0:9] slice showed only nine).
for i in cluster_indexes[33][0:10]:
    image_path = image_root+movie_embeddings[i]["input"]
    display(Image(filename=image_path))

# Cluster ids grouped by how much of each cluster counts as yard footage
half_yard_clusters = [0]  # Cluster 0 is one-half a yard cluster
mostly_yard_clusters = [18, 22, 28]  # Clusters 18, 22, and 28 are 80% a yard cluster
full_yard_clusters = [3, 4, 8, 9, 10, 17, 23, 24, 29, 30, 33]  # entirely yard clusters
# Cluster 27 contains only a few frames of the shrunken children in the yard, so it's omitted

# Each partial cluster is rounded on its own before summing. round() without
# ndigits returns an int, so the total is a whole number and prints as "N"
# rather than "N.0".
yard_clusters_length = (
    sum(round(len(cluster_indexes[c]) / 2) for c in half_yard_clusters)
    + sum(round(len(cluster_indexes[c]) / 5 * 4) for c in mostly_yard_clusters)
    + sum(len(cluster_indexes[c]) for c in full_yard_clusters)
)

# Printing total time in yard
print(f"The children are in the yard for about {yard_clusters_length} seconds.")

# Film Description
Our group went with the default choice of "Honey, I Shrunk the Kids." The 1989 movie centers around two sets of siblings who are shrunk by the inventor father's latest invention. One set of children is the inventor's children, Amy and Nick, and the other children are the neighbor's children, Russ and Ron. The children run away from things that would be mild dangers for adults, such as mud puddles and small scorpions, but are serious dangers at their size. They use whatever they can to give themselves an advantage, such as Lego block shelters, ants as transportation, and their dog's amazing hearing ability, to finally find and signal to the inventor father, who is able to save them.

# Methods Summary
# This section should highlight methods you used in your exploratory analysis. You should include at least one clustering technique or develop another way to relate frames to other frames. You should also consider dimensionality reduction.
We began by removing the opening portion and credits. These have drastically different coloration and would throw off the distances. We kept some of the provided code so we could see how information changed over time from an arbitrary frame and gain a better understanding of which clusters to explore more thoroughly. To find the character who appears most often, we took a target frame centered on each character and found the one with the lowest average distance to all frames, which turned out to be Ron. Then, we increased the number of clusters until we came across a dog-centered cluster. While that cluster included about five frames containing science images, there were also about five frames throughout the other clusters where the dog was represented. Plots are shown with the new clusters to see when the dog appears in the film temporally and to see how the original clusters changed around the dog. Then, to find the number of images that centered around the children's adventure through the yard, we used the same clusters to see which ones were yard-focused and summed them together.

# Hunches and Hypotheses
# This section should summarize the questions that you asked about the film that could potentially be answered by exploratory analysis. You should ask at least three questions.
1. What images appear most often on screen whether they be characters, settings, or anything else?
   H. Since the movie centers around the idea of children being shrunk and finding their way out, they would collectively be the most common image.
2. How often do we actually see the dog?
   H. The dog probably isn't seen as much as you would think. He is a tool for gags, so we'll see him eating a giant bone or as a plot device, but he otherwise won't come up.
3. How long do the children spend adventuring through the yard while shrunken?
   H. It's probably about half the movie. The yard presents a series of dangers, and we already know the children tame an ant to get back to the house, so it's unlikely to not account for a lot of the film.

# Results and Interpretation
# This section should include a summary of your findings. Describe the extent and results of your goal in answering questions.
1. There is very little difference in how often the children are shown on screen according to the average distance method. Ron does appear to be seen slightly more than the other children.
2. The dog appears sporadically throughout the film, but he appears up front a bit more, which may make him seem more prevalent than he is. In total, he appears for only about two minutes. He appears to exist as a device to shift the mood. The default cluster that was composed of scientific inventions and the dog has now broken into three separate clusters. This includes the new dog cluster and two scientific clusters, one for the inventor doing science and one for the devices.
3. The children are in the yard for 1,978 seconds or just under 33 minutes. While it doesn't account for more than half of the film, it does account for the second act.

# Reflection
# Reflect on your process of analysis. What worked well and did not work well? Describe the limitations of the work and describe what you would work on with more time.
Several attempts were made to approach the problems differently. We tried to work with a single-shot detection model in TensorFlow to detect dogs and certain individuals, but there were many configuration issues. An attempt was also made at taking a clipping of the dog target image as an ROI and searching for images closer to that, but with the limited number of clusters at the time, they separated more along average color than scene. Also, we tried to sub-divide already divided clusters. None of these techniques worked well. They were either too hard to implement given the time constraints, or they didn't provide good answers.

Dividing the original embeddings into more clusters worked the best. While this method still seems to divide along average coloration, average coloration serves as a very good proxy for scenes and themes. One cluster even managed to capture the scientist's family's daughter in scenes where she had been soaked. (No one had thought to guess that the coloration of people changes that dramatically when they are wet.)

If we had more time, we would have trained a single-shot detection model to capture various items throughout the movie, such as the Lego block, ant, dog, and children, to answer the questions. Also, we would have spent more time seeing how average distance to a target picture is influenced over a movie to make sure that methodology is correct or how it could be improved.