In [11]:
import os
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, IncrementalPCA
import pandas as pd
from utils import  find_repo_root
import gc
from tqdm import tqdm
import shutil

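Set `training_end_id` to the number of videos in your training set (default: 800). The first `training_end_id` videos are used to fit the scaler and the PCA; the fitted transforms are then applied to all videos.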

In [2]:
# Print the value returned by utils.find_repo_root().
# NOTE(review): the recorded output is None — confirm whether the repo root is expected to be found here.
print(find_repo_root())

None


In [3]:
# Show the current working directory; the relative input/output paths used
# later in this notebook are resolved against it.
os.getcwd()

'/work'

In [4]:
# Step 1: load feature maps per frame
def load_and_combine_tensors(stage_name, input_folder, num_videos):
    """Load the pickled feature maps of one stage and stack them along axis 0.

    Files are looked up as ``<input_folder>/<stage_name>/<NNNN>_<stage_name>.pkl``
    for video ids ``1..num_videos`` (zero-padded to four digits). Ids whose
    file does not exist are skipped and reported in a single warning line.

    Args:
        stage_name: Stage name; used both as sub-folder and as file suffix.
        input_folder: Folder containing one sub-folder per stage.
        num_videos: Highest video id to look for.

    Returns:
        ``(combined_tensor, video_indices)``: the concatenation of all loaded
        arrays along axis 0, and a dict mapping each zero-padded video id to
        its ``(start, end)`` row range inside ``combined_tensor``.
        ``(None, None)`` if no file was found.
    """
    tensors = []
    video_indices = {}
    missing_ids = []
    # Running row offset: avoids re-summing the shapes of every tensor loaded
    # so far on each iteration (which made the loop quadratic).
    next_row = 0

    for video_id in tqdm(range(1, num_videos + 1), desc="Loading feature maps"):
        video_key = str(video_id).zfill(4)
        file_path = os.path.join(input_folder, stage_name, f"{video_key}_{stage_name}.pkl")

        if not os.path.exists(file_path):
            missing_ids.append(video_key)
            continue

        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(file_path, 'rb') as file:
            tensor = pickle.load(file)

        tensors.append(tensor)
        # Track start and end rows of this video in the combined array
        video_indices[video_key] = (next_row, next_row + tensor.shape[0])
        next_row += tensor.shape[0]

    if not tensors:
        print("No tensors found to combine.")
        return None, None

    if missing_ids:
        # Downstream code that assumes one block of frames per consecutive
        # video id would be misaligned, so make the gap visible.
        print(f"Warning: {len(missing_ids)} feature map file(s) missing for {stage_name}; those videos were skipped.")

    return np.concatenate(tensors, axis=0), video_indices


In [5]:
# Step 2: globalized standardization (only based on training set)
def flatten_tensors(combined_tensor, num_videos, frames_per_video = 30):
    """Collapse the frame-level feature maps of each video into a single row.

    Rows ``i * frames_per_video`` to ``(i + 1) * frames_per_video`` of
    ``combined_tensor`` are treated as video ``i`` and flattened into one
    vector. Frames beyond ``num_videos * frames_per_video`` are ignored.

    Args:
        combined_tensor: Array whose first axis indexes frames.
        num_videos: Number of videos (rows of the result).
        frames_per_video: Number of consecutive frames belonging to one video.

    Returns:
        A new 2-D array of shape
        ``(num_videos, frames_per_video * features_per_frame)`` with the same
        dtype as ``combined_tensor``.

    Raises:
        ValueError: If ``combined_tensor`` holds fewer than
            ``num_videos * frames_per_video`` frames (the per-video slices
            would otherwise be short or empty and yield a ragged result).
    """
    required_frames = num_videos * frames_per_video
    if combined_tensor.shape[0] < required_frames:
        raise ValueError(
            f"Expected at least {required_frames} frames "
            f"({num_videos} videos x {frames_per_video} frames), "
            f"got {combined_tensor.shape[0]}."
        )

    features_per_frame = int(np.prod(combined_tensor.shape[1:]))

    # Write every flattened video straight into one preallocated array instead
    # of collecting a list of copies and converting it afterwards; this keeps
    # only a single extra copy of the data in memory.
    x = np.empty(
        (num_videos, frames_per_video * features_per_frame),
        dtype=combined_tensor.dtype,
    )
    for i in tqdm(range(num_videos), desc="Flattening feature maps"):
        start_idx = i * frames_per_video
        x[i] = combined_tensor[start_idx:start_idx + frames_per_video].reshape(-1)

    return x

def standardize_tensors(combined_tensor, training_end_id=800, frames_per_video=30):
    """Flatten the feature maps per video and standardize each feature.

    The scaler is fitted on the first ``training_end_id`` videos only and then
    applied to all videos, so statistics of later (non-training) videos never
    influence the transform.

    Args:
        combined_tensor: Array whose first axis indexes frames, with
            ``frames_per_video`` consecutive frames per video.
        training_end_id: Number of leading videos that form the training set.
        frames_per_video: Number of frames belonging to one video.

    Returns:
        Standardized 2-D array with one row per video.
    """
    # Derive the video count from the data itself rather than reading a
    # notebook-level variable, so the function works regardless of which
    # cells have been executed.
    num_videos = combined_tensor.shape[0] // frames_per_video
    reshaped_tensor = flatten_tensors(combined_tensor, num_videos, frames_per_video)

    if training_end_id > reshaped_tensor.shape[0]:
        # Slicing would silently fall back to fitting on every sample.
        print(f"Warning: training_end_id ({training_end_id}) exceeds the number of videos "
              f"({reshaped_tensor.shape[0]}); the scaler is fitted on all samples.")

    scaler = StandardScaler()

    print("Standardizing feature maps")
    # Fit the scaler only on the training set
    scaler.fit(reshaped_tensor[:training_end_id, :])

    # Transform both training and test sets
    standardized_data = scaler.transform(reshaped_tensor)
    print("Feature maps standardized")
    return standardized_data

In [6]:
# Step 3: Fit PCA on train, apply to train and val, save PCs, save metadata
def apply_pca_and_save(standardized_tensor, stage_name, output_folder, training_end_id=800, n_components=400, seed=42):
    """Fit PCA on the training rows, project all rows, and save the projection.

    The principal components are saved with ``np.save`` under
    ``<output_folder>/PCA_dataset/<stage_name>/<stage_name>_pca`` (``np.save``
    appends the ``.npy`` extension).

    Args:
        standardized_tensor: 2-D array with one row per sample.
        stage_name: Stage name; used for the output sub-folder and file name.
        output_folder: Base folder under which ``PCA_dataset`` is created.
        training_end_id: Number of leading rows used to fit the PCA.
        n_components: Number of principal components to keep.
        seed: ``random_state`` passed to ``PCA``.

    Returns:
        Dict with the keys ``'stage'``, ``'pca_shape'`` (shape of the projected
        array) and ``'variance_captured'`` (sum of the explained variance
        ratios of the kept components).
    """
    # Fit PCA on train, apply to train and val
    print(f"Performing PCA on {stage_name} data...")
    pca = PCA(n_components=n_components,random_state=seed)
    pca.fit(standardized_tensor[:training_end_id, :])
    pca_tensor = pca.transform(standardized_tensor)

    # save PCs; exist_ok avoids the check-then-create race and makes re-runs safe
    pca_folder = os.path.join(output_folder, 'PCA_dataset', stage_name)
    os.makedirs(pca_folder, exist_ok=True)

    pca_filename = os.path.join(pca_folder, f"{stage_name}_pca")
    np.save(pca_filename,pca_tensor)
    print(f"PCs of {stage_name} data saved.")

    # metadata describing the projection of this stage
    return {
        'stage': stage_name,
        'pca_shape': pca_tensor.shape,
        'variance_captured': np.sum(pca.explained_variance_ratio_)
    }

In [7]:
# Loop through all stages and perform PCA
stages = ["stage_1", "stage_3", "stage_4", "stage_5", "final"] # non functioning stages: stage_2, 
input_folder = 'preprocessed_videos_30frames'
# Resolve outputs against the working directory instead of a hard-coded
# absolute path, matching how the input folder is resolved.
output_folder = os.getcwd()

metadata = []
for stage_name in stages:

    stage_folder = os.path.join(os.getcwd(), input_folder, stage_name)
    # Count only this stage's feature-map pickles. The metadata CSV written at
    # the end of each iteration lives in the same folder and must not be
    # counted as a video when the cell is re-run.
    feature_map_suffix = f"_{stage_name}.pkl"
    num_videos = len([
        f for f in os.listdir(stage_folder)
        if f.endswith(feature_map_suffix) and os.path.isfile(os.path.join(stage_folder, f))
    ])

    # Step 1: Load and combine the feature maps for each frame
    combined_tensor, video_indices = load_and_combine_tensors(stage_name, input_folder, num_videos)
    if combined_tensor is None:
        # Nothing was loaded for this stage; skip it instead of failing on None.
        print(f"Skipping {stage_name}: no feature maps loaded.")
        continue
    print(combined_tensor.shape)

    # Step 2: flatten feature maps per video and standardize values on training set, apply to train and val
    standardized_tensor = standardize_tensors(combined_tensor)
    print(standardized_tensor.shape)

    # Free the frame-level array before PCA allocates its own buffers
    del combined_tensor
    gc.collect()

    # Step 3: Fit PCA on train, apply to train and val, save PCs, save metadata
    stage_metadata = apply_pca_and_save(standardized_tensor, stage_name, output_folder)
    # Record this stage so the table below (and the CSV) actually contains it
    metadata.append(stage_metadata)
    metadata_df = pd.DataFrame(metadata)
    print(metadata_df)
    metadata_filename = os.path.join(stage_folder, f"{stage_name}_metadata.csv")
    metadata_df.to_csv(metadata_filename, index=False)

    del standardized_tensor
    gc.collect()

Loading feature maps: 100%|██████████| 1000/1000 [00:23<00:00, 42.05it/s]


(30000, 1, 28, 28, 512)


Flattening feature maps: 100%|██████████| 1000/1000 [00:16<00:00, 59.61it/s]


Standardizing feature maps
Feature maps standardized
(1000, 12042240)
Performing PCA on stage_3 data...
PCs of stage_3 data saved.
     stage    pca_shape  variance_captured
0  stage_3  (1000, 400)           0.632354


Loading feature maps: 100%|██████████| 1000/1000 [00:13<00:00, 74.88it/s]


(30000, 1, 14, 14, 1024)


Flattening feature maps: 100%|██████████| 1000/1000 [00:09<00:00, 109.76it/s]


Standardizing feature maps
Feature maps standardized
(1000, 6021120)
Performing PCA on stage_4 data...
PCs of stage_4 data saved.
     stage    pca_shape  variance_captured
0  stage_3  (1000, 400)           0.632354
1  stage_4  (1000, 400)           0.629889


Loading feature maps: 100%|██████████| 1000/1000 [00:07<00:00, 142.22it/s]


(30000, 1, 7, 7, 2048)


Flattening feature maps: 100%|██████████| 1000/1000 [00:04<00:00, 200.72it/s]


Standardizing feature maps
Feature maps standardized
(1000, 3010560)
Performing PCA on stage_5 data...
PCs of stage_5 data saved.
     stage    pca_shape  variance_captured
0  stage_3  (1000, 400)           0.632354
1  stage_4  (1000, 400)           0.629889
2  stage_5  (1000, 400)           0.665212


Loading feature maps: 100%|██████████| 1000/1000 [00:00<00:00, 1340.99it/s]


(30000, 1, 1000)


Flattening feature maps: 100%|██████████| 1000/1000 [00:00<00:00, 12205.62it/s]


Standardizing feature maps
Feature maps standardized
(1000, 30000)
Performing PCA on final data...
PCs of final data saved.
     stage    pca_shape  variance_captured
0  stage_3  (1000, 400)           0.632354
1  stage_4  (1000, 400)           0.629889
2  stage_5  (1000, 400)           0.665212
3    final  (1000, 400)           0.839756


In [9]:
####################--------------------------------------------------------------------------------------#################

In [10]:
# Pack the PCA output directory into a zip archive in the working directory
directory_to_zip = "PCA_dataset"  # directory whose contents are archived
output_filename = "PCA_dataset"  # archive base name, without the ".zip" extension
output_path = os.path.join(os.getcwd(), output_filename)
shutil.make_archive(base_name=output_path, format='zip', root_dir=directory_to_zip)

NameError: name 'shutil' is not defined

In [None]:
# TODO: batch-wise standardization for stage_2 (not implemented yet; stage_2 is excluded from the stage loop)



In [None]:
# TODO: incremental PCA (IncrementalPCA) for stage_2 (not implemented yet)