# Image Frame extraction from video
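- Once downloaded, the selected video (row `cindex` of the metadata CSV) is read in and its frames are saved to disk as JPEG images. Note: the extraction cell currently uses a step of 1, i.e. it keeps every frame rather than every 5th.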
-

In [1]:
import os
import cv2
import pandas as pd
from PIL import Image

from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import ViTFeatureExtractor, ViTModel
import torch
from torchvision import transforms
import torchvision
import numpy as np
import timm
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Global matplotlib appearance shared by every figure in this notebook.
plt.rcParams.update({"figure.figsize": (10, 6), "axes.grid": False})
# The style sheet is applied between the two rcParams updates, in the same
# position it had relative to the individual settings.
plt.style.use("dark_background")
plt.rcParams.update({
    "axes.spines.top": False,
    "axes.spines.right": False,
    "font.size": 13,
})

import ast

In [2]:
# Load the per-video metadata table and pick the recording to process.
df = pd.read_csv("./data/csvs/metadata_pond-rotifers.csv")
cindex = 3  # row label of the video to process (original note: "got 4 already")
selected_row = df.loc[cindex]
file_name = selected_row["file_name"]
# "resolution" is stored as text such as "[1920, 1080]"; literal_eval parses it
# without executing arbitrary code, and the result is frozen into a tuple.
res = tuple(ast.literal_eval(selected_row["resolution"]))
df.head(8)


Unnamed: 0,file_path,file_name,creation,duration,resolution,fps,desc
0,./data/videos/pond-water-rotifers/2020_0107_01...,2020_0107_010256_004.MP4,1706962000.0,49.93,"[1920, 1080]",30.0,
1,./data/videos/pond-water-rotifers/2020_0109_03...,2020_0109_035310_014.MP4,1706962000.0,128.23,"[1920, 1080]",30.0,
2,./data/videos/pond-water-rotifers/2020_0109_06...,2020_0109_060802_030.MP4,1706962000.0,64.17,"[1920, 1080]",30.0,
3,./data/videos/pond-water-rotifers/2024_0104_12...,2024_0104_121454_057.MP4,1706962000.0,100.8,"[1920, 1080]",30.0,
4,./data/videos/pond-water-rotifers/2024_0109_11...,2024_0109_113533_051.MP4,1706962000.0,134.13,"[2880, 2160]",24.0,
5,./data/videos/pond-water-rotifers/2024_0116_12...,2024_0116_120310_060.MP4,1706962000.0,77.12,"[1920, 1080]",60.0,
6,./data/videos/pond-water-rotifers/2024_0116_12...,2024_0116_121239_064.MP4,1706962000.0,217.8,"[1920, 1080]",60.0,
7,./data/videos/pond-water-rotifers/2024_0116_15...,2024_0116_150020_071.MP4,1706962000.0,205.52,"[1920, 1080]",60.0,


In [None]:
# Dump the frames of the selected video to disk as individual JPEG files.
project_root = "/Users/mikehemberger/Documents/vscode/rotifer-locomotion"
fpath = f"{project_root}/data/videos/pond-water-rotifers/{file_name}"
# One output folder per video, named after the file name minus its last four
# characters (the ".MP4" extension for the files listed in the metadata).
frame_dir = f"{project_root}/data/images/{file_name[:-4]}/"
os.makedirs(frame_dir, exist_ok=True)

# Keep every `frame_step`-th frame; 1 keeps all frames, which is what the
# previous hard-coded `% 1` did.
frame_step = 1

video = cv2.VideoCapture(fpath)
if not video.isOpened():
    # Without this check a wrong path just produces zero frames and no error.
    raise OSError(f"Could not open video: {fpath}")

frame_counter = 0
try:
    while True:
        ret, frame = video.read()
        if not ret:  # no further frame could be read
            break
        if frame_counter % frame_step == 0:
            # zfill(4) pads to at least four digits; longer counters are kept whole.
            out_path = f"{frame_dir}frame{str(frame_counter).zfill(4)}.jpg"
            if not cv2.imwrite(out_path, frame):
                raise OSError(f"Could not write frame: {out_path}")
        frame_counter += 1
finally:
    # Release the capture even if reading or writing fails part-way through.
    video.release()


In [None]:
# Make them 224x224 first!
# Writes a 224x224 copy of every extracted frame into an "images-224x224"
# subfolder of the frame directory.
original_size = res  # source resolution from the metadata; not needed by the resize itself
target_size = (224, 224)

cdir = f"/Users/mikehemberger/Documents/vscode/rotifer-locomotion/data/images/{file_name[:-4]}/"
resized_dir = os.path.join(cdir, "images-224x224")
os.makedirs(resized_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so every run processes
# the frames in the same (name) order.
ori_imgs = sorted(f for f in os.listdir(cdir) if f.endswith(".jpg"))
target_imgpaths = [os.path.join(resized_dir, f.replace(".jpg", "-224x224.jpg")) for f in ori_imgs]

for orig, targ in zip(ori_imgs, target_imgpaths):
    # The context manager closes the source file as soon as the copy is saved.
    # Note: resize() stretches to the target size; aspect ratio is not preserved.
    with Image.open(os.path.join(cdir, orig)) as oim:
        oim.resize(target_size).save(targ)

In [None]:

# Extract one feature vector per image with a pretrained ViT from timm.
torch.set_grad_enabled(False)  # inference only: switch autograd off globally
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

img_size = (224, 224)
vision_model = "google/vit-base-patch16-224-in21k"  # only referenced by the disabled Hugging Face variant
bs_vm, nw_vm = 6, 6

# MODEL
#vit_feature_extractor = ViTFeatureExtractor.from_pretrained(vision_model)
#vit_model = ViTModel.from_pretrained(vision_model, return_dict=True)
#vit_model.to(device)

# "vit_base_patch16_384.augreg2_in21k_ft_in1k"
# num_classes=0 drops the classification head so the model returns pooled features.
model = timm.create_model("vit_base_patch16_224_in21k", pretrained=True, num_classes=0, global_pool="avg").eval()
model.to(device)

# Take the normalisation statistics from the checkpoint's own data config
# instead of hard-coding the ImageNet defaults (mean 0.485/0.456/0.406,
# std 0.229/0.224/0.225), which may not match what this model was trained with.
data_cfg = timm.data.resolve_data_config({}, model=model)
norm_tf = transforms.Normalize(mean=data_cfg["mean"], std=data_cfg["std"])

tfs = transforms.Compose([
    transforms.Resize(img_size),
    transforms.ToTensor(),
    norm_tf])

# ImageFolder treats each subdirectory of `root` as one class; presumably the
# "images-224x224" folder is the only one here — TODO confirm. The path is
# relative, so the notebook must run from the project root.
ds = torchvision.datasets.ImageFolder(root=f"./data/images/{file_name[:-4]}/")
ds.transform = tfs
print(ds)

# shuffle=False keeps the feature rows in dataset order.
dataloader = DataLoader(ds, batch_size=bs_vm, num_workers=4, shuffle=False)

# FEATURES
features = list()
with torch.no_grad():  # explicit guard in case grad mode was re-enabled elsewhere
    for images, _ in dataloader:  # the folder labels are not needed for embedding
        outputs = model(images.to(device))
        features.append(outputs.cpu().numpy())

# Stack the per-batch arrays into one array with one row per image.
feature_vectors = np.concatenate(features)

In [None]:
# Inspect the size of the concatenated model outputs
# (presumably one row per image and one column per feature — TODO confirm).
feature_vectors.shape

In [None]:

# t-SNE configuration, gathered in one place. The original note listed
# "150, 200, 250" as alternatives without saying which parameter they belong to.
tsne_settings = dict(
    n_components=2,
    perplexity=50,
    early_exaggeration=150,
    learning_rate=45,
    random_state=42,
    init="pca",
)
tsne = TSNE(**tsne_settings)

# Project the feature vectors down to two dimensions.
embed = tsne.fit_transform(feature_vectors)


In [None]:
# Quick look at the raw 2-D embedding, drawn on square axes.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(embed[:, 0], embed[:, 1], s=20, alpha=.4, edgecolors="none")
ax.axis("square")

In [None]:
# clustering
# Runs k-means on the 2-D t-SNE embedding and plots every cluster in its own colour.
time_label = np.arange(0, embed.shape[0], 1)  # running index per embedded point; not used by this plot

n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
kmeans.fit(embed)
clusters = kmeans.labels_

unique_cluster_labels = np.unique(clusters)
print(unique_cluster_labels)

# Sample one evenly spaced colour from the "bwr" colormap per cluster.
cmap = plt.get_cmap('bwr')
colors = [cmap(i) for i in np.linspace(0, 1, len(unique_cluster_labels))]

plt.figure(figsize=(7,7))

for nth, cluster in enumerate(unique_cluster_labels):
    idx = clusters == cluster
    # Use `color=` rather than `c=`: a bare RGBA tuple given as `c` triggers a
    # Matplotlib warning and is read as four scalar values when a cluster
    # happens to contain exactly four points.
    plt.scatter(embed[idx, 0], embed[idx, 1], marker="o", s=5, edgecolors="None", color=colors[nth], alpha=.75)

#plt.title(f"Text Features")

plt.axis("equal")
plt.xlabel("tSNE dim 1")
plt.ylabel("tSNE dim 2")
plt.tight_layout()
plt.savefig(f"./{file_name}_tSNE_nclusters_{n_clusters}.jpg")
# probably use torchvision.utils.save_image (tensors)

In [None]:
# Map each cluster's position in `unique_cluster_labels` to the row indices
# of the points assigned to that cluster.
cluster_indices = {
    position: np.where(clusters == label)[0]
    for position, label in enumerate(unique_cluster_labels)
}

cluster_indices

In [None]:

# Save one image grid per cluster. The dataset transform is swapped for one
# without the normalisation step before the images are read back.
tfs = transforms.Compose([transforms.Resize(img_size), transforms.ToTensor()])
ds.transform = tfs

cclust = cluster_indices[0]

for cluster_id, member_rows in cluster_indices.items():
    # Element 0 of each dataset item is the image tensor; the label is dropped.
    slist = torch.stack([ds[row][0] for row in member_rows])
    torchvision.utils.save_image(slist, fp=f"./{file_name}_cluster_{cluster_id}.jpg", nrow=20)

#slist.shape

In [None]:
# from sklearn.cluster import DBSCAN
# db = DBSCAN(eps=5.75, min_samples=50).fit(features_embedded)  # You can adjust eps and min_samples
# clusters = db.labels_