In [2]:
import pickle

import numpy as np
import torch
from transformers import (
    AutoFeatureExtractor,
    AutoImageProcessor,
    AutoModel,
    AutoProcessor,
    HubertModel,
)
# from PIL import Image
# from matplotlib import pyplot as plt
# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE

In [5]:
# Read the pickled dataset index, then load the audio array referenced by each of
# its first ten entries, printing every array's shape along the way.
# NOTE: pickle.load can execute arbitrary code -- only open files from a trusted source.
with open('../IEMOCAP/data_collected.pickle', 'rb') as handle:
    dataset = pickle.load(handle)

audio_frames = []
for entry in dataset[:10]:
    waveform = np.load(entry["audio"]["audio_path"])
    print(waveform.shape)
    audio_frames.append(waveform)


(96480,)
(70645,)
(35520,)
(29880,)
(100960,)
(28320,)
(210321,)
(72971,)
(105440,)
(147840,)


In [10]:
class AudioEncoder:
    """Embed raw audio with a pretrained HuBERT checkpoint and its processor.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained`` for both
            the model and the processor.
        sampling_rate: Sampling rate (Hz) reported to the processor on every
            ``encode`` call. Defaults to 16000, the value previously hard-coded.
        device: Optional device (string or ``torch.device``) to run the model on.
            ``None`` (default) leaves the model wherever ``from_pretrained`` put it
            and leaves the processor outputs untouched.
    """

    def __init__(self, model_name="facebook/hubert-large-ls960-ft", sampling_rate=16000, device=None):
        self.model = HubertModel.from_pretrained(model_name)
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.sampling_rate = sampling_rate
        self.device = torch.device(device) if device is not None else None
        if self.device is not None:
            self.model.to(self.device)

    def encode(self, audio_data):
        """Run the model on ``audio_data`` and return its last hidden state.

        Args:
            audio_data: Audio input accepted by the processor (the notebook passes
                1-D waveforms).

        Returns:
            ``outputs.last_hidden_state`` of the model, computed without gradients.
        """
        inputs = self.processor(
            audio_data,
            return_tensors="pt",
            sampling_rate=self.sampling_rate,
            padding="longest",
        )
        if self.device is not None:
            # The processor builds its tensors independently of the model, so they
            # must be moved explicitly to wherever the model was placed.
            inputs = {key: value.to(self.device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state

In [11]:
# Build the audio encoder with its default checkpoint; this loads the model and processor.
audio_encoder = AudioEncoder()

Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertModel: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Encode every loaded clip. Each NumPy waveform goes to the encoder as-is: device
# placement is the encoder's concern, and an unconditional .cuda() here would make
# the cell crash on machines without a GPU.
audio_features = [audio_encoder.encode(audio) for audio in audio_frames]
audio_features[0].shape

torch.Size([1, 301, 1024])

In [15]:
# Average the first clip's hidden states over dims 0 and 1, leaving a single vector
# along the last (hidden) dimension.
audio_test = audio_features[0].mean(dim=(0,1))
audio_test.shape

torch.Size([1024])

In [105]:
# Load two precomputed feature arrays from disk: one from the IEMOCAP folder and
# one from the Emotion-LLaMA MERR folder.
feature_location = '../IEMOCAP/maeV_features/Ses01F_impro02_M016.npy'
feature1 = np.load(feature_location)

feature2_location2 = '../../Emotion-LLaMA/MERR/maeV_399_UTT/sample_00000005.npy'
feature2 = np.load(feature2_location2)

In [107]:
def numpy_to_tensor(data, target_frames=16, size=(224, 224)):
    """Turn a stack of video frames into a fixed-length list of resized frame tensors.

    When there are at least ``target_frames`` frames they are subsampled at evenly
    spaced indices; otherwise all-zero frames are appended. Values are divided by
    255, the tensor is moved to CUDA when available, and every frame is bilinearly
    resized to ``size``.

    Args:
        data: NumPy array laid out as (frames, height, width, channels).
            Values are assumed to lie in 0-255 -- TODO confirm for non-uint8 inputs.
        target_frames: Number of frames to return. Defaults to 16.
        size: Output ``(height, width)`` of each frame. Defaults to ``(224, 224)``.

    Returns:
        A list of ``target_frames`` float tensors, each shaped
        ``(channels, size[0], size[1])``.

    Raises:
        ValueError: If ``data`` is not a 4-D array or ``target_frames`` < 1.
    """
    if getattr(data, "ndim", None) != 4:
        # Catches, for example, passing the list this function returns back in.
        raise ValueError(
            "numpy_to_tensor expects a 4-D (frames, height, width, channels) array, "
            f"got {type(data).__name__} with shape {getattr(data, 'shape', None)}"
        )
    if target_frames < 1:
        raise ValueError(f"target_frames must be >= 1, got {target_frames}")

    total_frames = data.shape[0]

    if total_frames >= target_frames:
        # Pick evenly spaced frame indices, always including the first and last frame.
        indices = np.linspace(0, total_frames - 1, target_frames, dtype=int)
        data = data[indices]
    else:
        # Too few frames: append all-zero frames at the end.
        padding = np.zeros((target_frames - total_frames, *data.shape[1:]), dtype=data.dtype)
        data = np.concatenate([data, padding], axis=0)

    # Convert to a float tensor scaled by 1/255.
    tensor_frames = torch.from_numpy(data).float()
    tensor_frames = tensor_frames / 255.0
    if torch.cuda.is_available():
        tensor_frames = tensor_frames.cuda()

    # (frames, H, W, C) -> (frames, C, H, W): interpolate resizes the trailing two dims.
    tensor_frames = tensor_frames.permute(0, 3, 1, 2)
    tensor_frames = torch.nn.functional.interpolate(
        tensor_frames,
        size=tuple(size),
        mode='bilinear',
        align_corners=False
    )

    # One (C, H, W) tensor per frame.
    return list(tensor_frames)

## MAE Encoder

In [108]:
class VideoLocalEncoder:
    """Per-frame encoder built on a pretrained ViT-MAE checkpoint.

    NOTE: this notebook defines a second class with the same name in the VideoMAE
    section; whichever definition cell ran last is the one bound to the name.

    Args:
        pretrained_model: Checkpoint identifier handed to ``from_pretrained``.
        mask_ratio: Optional override for the model config's ``mask_ratio``.
            ``None`` (default) keeps the checkpoint's own setting. ViT-MAE drops a
            random subset of patches on each forward pass according to this ratio
            (the 50-token output recorded in this notebook is consistent with most
            patches being dropped); pass ``0.0`` to encode every patch and get
            repeatable features.
    """

    def __init__(self, pretrained_model="facebook/vit-mae-large", mask_ratio=None):
        self.image_processor = AutoImageProcessor.from_pretrained(pretrained_model)
        # Only forward the override when requested so default loading is unchanged.
        config_overrides = {} if mask_ratio is None else {"mask_ratio": mask_ratio}
        self.model = AutoModel.from_pretrained(pretrained_model, **config_overrides)

    @torch.no_grad()
    def encode_video(self, video):
        """Preprocess ``video`` and return the model's last hidden state.

        Args:
            video: Frames in any form the image processor accepts (the notebook
                passes a list of per-frame tensors).
                NOTE(review): those frames were already divided by 255 upstream; if
                the processor also rescales by default they are scaled twice -- confirm.

        Returns:
            ``outputs.last_hidden_state`` of the model, computed without gradients.
        """
        inputs = self.image_processor(video, return_tensors="pt")
        outputs = self.model(**inputs)
        return outputs.last_hidden_state

In [109]:
# Load the preprocessed video frames of one IEMOCAP utterance from disk.
data_location = '../IEMOCAP/processed_videoframes/Ses01F_impro01_F005.npy' 
data = np.load(data_location)

In [110]:
# Convert from the file on disk rather than from `data` itself so this cell can be
# re-run: after the first run `data` holds numpy_to_tensor's list output, which
# numpy_to_tensor cannot take as input.
data = numpy_to_tensor(np.load(data_location))
len(data)

16

In [None]:
# Instantiate the encoder with its default checkpoint and encode the prepared frame list.
# NOTE: VideoLocalEncoder is defined twice in this notebook; this call uses whichever
# definition was executed most recently.
mae_encoder = VideoLocalEncoder()
feature = mae_encoder.encode_video(data)

In [22]:
# Inspect the shape of the encoder's last hidden state.
feature.shape

torch.Size([16, 50, 1024])

In [114]:
# Average the hidden states over dims 0 and 1 into one pooled vector, then bring it
# back to the CPU as a NumPy array.
n_feature = feature.mean(dim=(0, 1)).cpu().numpy()
n_feature.shape

(1024,)

## VideoMAE Encoder

In [94]:
class VideoLocalEncoder:
    """Clip-level encoder built on a pretrained VideoMAE checkpoint.

    NOTE: this notebook defines another class with the same name in the MAE
    section; whichever definition cell ran last is the one bound to the name.

    Args:
        pretrained_model: Checkpoint identifier handed to ``from_pretrained``.
    """

    def __init__(self, pretrained_model="MCG-NJU/videomae-base-finetuned-kinetics"):
        # Loaded through AutoImageProcessor, the non-deprecated entry point for vision
        # preprocessing; the attribute keeps its original name for existing callers.
        self.feature_extractor = AutoImageProcessor.from_pretrained(pretrained_model)
        self.model = AutoModel.from_pretrained(pretrained_model)

    @torch.no_grad()
    def encode_video(self, video):
        """Preprocess ``video`` and return the model's last hidden state.

        Args:
            video: Frames in any form the preprocessor accepts (the notebook passes
                a list of per-frame tensors).

        Returns:
            ``outputs.last_hidden_state`` of the model, computed without gradients.
        """
        inputs = self.feature_extractor(video, return_tensors="pt")
        outputs = self.model(**inputs)
        return outputs.last_hidden_state

In [96]:
# Instantiate the most recently defined VideoLocalEncoder with its default checkpoint.
videoEncoder = VideoLocalEncoder()

Some weights of the model checkpoint at MCG-NJU/videomae-base-finetuned-kinetics were not used when initializing VideoMAEModel: ['classifier.bias', 'classifier.weight', 'fc_norm.bias', 'fc_norm.weight']
- This IS expected if you are initializing VideoMAEModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VideoMAEModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [109]:
# Rebuild the frame list from the raw file instead of from `data`: an earlier cell
# overwrites `data` with numpy_to_tensor's list output, and feeding that list back
# into numpy_to_tensor fails on a top-to-bottom run.
tensor_data = numpy_to_tensor(np.load(data_location))
# video = list(np.random.uniform(0,1,size=(16,3,224,224)))
encoded_features = videoEncoder.encode_video(tensor_data)
encoded_features.shape

torch.Size([1, 1568, 768])