Step 1: Extract the audio track from each video clip into a separate folder: Audio

In [1]:
from moviepy.editor import VideoFileClip
import os

def extract_audio_from_folder(input_folder, output_folder):
    """Extract the audio track of every MP4 in ``input_folder`` to a WAV file.

    Each ``<name>.mp4`` found directly inside ``input_folder`` is passed to
    ``extract_audio`` and written to ``<output_folder>/<name>.wav``.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.mp4`` files.
        output_folder: Directory receiving the ``.wav`` files; created if it
            does not exist yet.
    """
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir() returns entries in arbitrary order, so sort them to make
    # the processing order reproducible between runs and machines.
    for video_file in sorted(os.listdir(input_folder)):
        # Compare the extension case-insensitively so 'CLIP.MP4' is not
        # silently skipped.
        if not video_file.lower().endswith('.mp4'):
            continue

        input_video_path = os.path.join(input_folder, video_file)
        output_audio_path = os.path.join(output_folder, f'{os.path.splitext(video_file)[0]}.wav')

        extract_audio(input_video_path, output_audio_path)

def extract_audio(input_video_path, output_audio_path):
    """Write the audio track of one video file as 16-bit PCM WAV.

    The audio is written at the track's own sampling rate (``audio_clip.fps``).

    Args:
        input_video_path: Path of the video file to read.
        output_audio_path: Path of the WAV file to write.

    Raises:
        ValueError: If the video has no audio track.
    """
    video_clip = VideoFileClip(input_video_path)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            # Without this check the failure would surface as an opaque
            # AttributeError on None.
            raise ValueError(f"No audio track found in '{input_video_path}'")

        audio_clip.write_audiofile(output_audio_path, codec='pcm_s16le', fps=audio_clip.fps)
    finally:
        # Always release the clip, even when writing fails.
        video_clip.close()

# Example usage: convert every clip in the video folder to a WAV file
input_video_folder = 'Single_Actor_01'
output_audio_folder = 'Single_Audio_01'

extract_audio_from_folder(
    input_folder=input_video_folder,
    output_folder=output_audio_folder,
)


MoviePy - Writing audio in Single_Audio_01\01-02-01-01-01-01-01.wav


                                                       

MoviePy - Done.




Step 2: Generate normalized spectrograms from the audio files in the folder: Audio

In [2]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import os

def generate_spectrogram(audio_file, fft_size=256, hop_size=10, window_size=32, num_parts=6,
                         duration=4, n_mels=256):
    """Compute a dB-scaled mel spectrogram and split it into equal time slices.

    Args:
        audio_file: Path of the audio file to load.
        fft_size: Lower bound for the FFT size, in samples.
        hop_size: Hop between frames, in milliseconds.
        window_size: Length of the Hamming analysis window, in milliseconds.
        num_parts: Number of consecutive slices to cut along the time axis.
        duration: Maximum number of seconds of audio to load.
        n_mels: Number of mel frequency bands.

    Returns:
        A list of ``num_parts`` arrays, each holding ``frames // num_parts``
        consecutive frames of the spectrogram. Frames left over after the
        integer division are dropped; if there are fewer frames than
        ``num_parts`` the slices are empty.

    Raises:
        ValueError: If ``num_parts`` is smaller than 1.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (0) or a
    # silently empty result (negative values) further down.
    if num_parts < 1:
        raise ValueError(f"num_parts must be a positive integer, got {num_parts!r}")

    # sr=None asks librosa to keep the file's own sampling rate.
    y, sr = librosa.load(audio_file, sr=None, duration=duration)

    # Convert the millisecond settings to sample counts.
    hop_length = int(sr * hop_size / 1000)
    win_length = int(sr * window_size / 1000)

    # The FFT size is raised to at least the number of loaded samples.
    # NOTE(review): this makes every frame's FFT as long as the whole clip,
    # which looks expensive - confirm this is intended before changing it,
    # since a smaller n_fft may not resolve all n_mels bands.
    n_fft = max(fft_size, len(y))

    spectrogram = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window="hamming",
        n_mels=n_mels,
    )

    # Convert power to decibels relative to the loudest bin.
    spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)

    # Cut the time axis into num_parts equally sized slices.
    part_size = spectrogram_db.shape[1] // num_parts
    return [
        spectrogram_db[:, index * part_size:(index + 1) * part_size]
        for index in range(num_parts)
    ]

def normalize_sequences(sequences):
    """Standardize spectrogram slices per frequency row.

    The mean and standard deviation of every row are computed over all slices
    joined along the time axis, then each slice is shifted and scaled with
    those shared statistics.

    Args:
        sequences: Sequence of 2-D arrays with the same number of rows.

    Returns:
        A list with one normalized array per input slice. Rows whose values
        are constant across all slices come out as zeros.
    """
    if len(sequences) == 0:
        # np.concatenate refuses an empty list; nothing to normalize.
        return []

    # Join the slices along time so the statistics cover the whole clip.
    flat_sequences = np.concatenate(sequences, axis=1)

    mean = np.mean(flat_sequences, axis=1, keepdims=True)
    std = np.std(flat_sequences, axis=1, keepdims=True)

    # A constant row has zero std; dividing by it would yield NaN/inf.
    # Using 1 for those rows leaves them as all zeros after centering.
    safe_std = np.where(std == 0, 1.0, std)

    return [(seq - mean) / safe_std for seq in sequences]
    
def save_normalized_spectrogram_images(audio_file_path, audio_file,normalized_parts, output_folder):
    """Render each normalized spectrogram slice to a borderless PNG.

    Images are named ``<audio file stem>-NN.png`` where ``NN`` is the 1-based,
    two-digit index of the slice.

    Args:
        audio_file_path: Path of the source audio file; only its sampling
            rate is read.
        audio_file: File name of the audio file, used to build image names.
        normalized_parts: Iterable of 2-D spectrogram slices to draw.
        output_folder: Directory receiving the images; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Only the sampling rate is needed, so query it directly instead of
    # decoding the audio samples a second time.
    sr = librosa.get_samplerate(audio_file_path)
    hop_length = int(sr * 10 / 1000)  # 10 ms hop, same as generate_spectrogram's default hop_size
    base_name = os.path.splitext(audio_file)[0]

    for i, part in enumerate(normalized_parts):
        fig = plt.figure(figsize=(6, 4))
        try:
            # Draw the slice without axes or labels.
            librosa.display.specshow(part, sr=sr, hop_length=hop_length, x_axis=None, y_axis=None)
            plt.axis('off')

            # Two-digit suffix keeps names aligned and sortable past part 9
            # (the old '-0{i+1}' form produced '-010').
            image_path = os.path.join(output_folder, f'{base_name}-{i + 1:02d}.png')
            plt.savefig(image_path, bbox_inches='tight', pad_inches=0)
        finally:
            # Release the figure even if drawing or saving fails.
            plt.close(fig)

def save_normalized_spectrogram_images_from_folder(input_folder, output_folder):
    """Create normalized spectrogram images for every WAV file in a folder.

    For each ``.wav`` file found directly inside ``input_folder`` the
    spectrogram slices are generated, normalized and saved as images in
    ``output_folder``.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.wav`` files.
        output_folder: Directory receiving the images; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted for a reproducible order; os.listdir() order is arbitrary.
    for audio_file in sorted(os.listdir(input_folder)):
        # Case-insensitive match so 'CLIP.WAV' is not silently skipped.
        if not audio_file.lower().endswith('.wav'):
            continue

        audio_file_path = os.path.join(input_folder, audio_file)
        spectrogram_parts = generate_spectrogram(audio_file_path)
        normalized_parts = normalize_sequences(spectrogram_parts)
        save_normalized_spectrogram_images(audio_file_path, audio_file, normalized_parts, output_folder)
            

# Example usage: turn every WAV in the audio folder into spectrogram images
input_audio_folder = 'Single_Audio_01'
output_spectrogram_folder = 'Single_Spectogram_01'

save_normalized_spectrogram_images_from_folder(
    input_folder=input_audio_folder,
    output_folder=output_spectrogram_folder,
)

Step 3: Extract ResNet-18 features from the spectrogram images and apply spatial average pooling afterwards

In [4]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import torch.nn as nn
import os
import warnings

# Load the pretrained ResNet-18. Warnings are silenced only while the hub call
# runs: catch_warnings() restores the previous filter state on exit, whereas
# resetwarnings() would also discard the default filters.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    resnet18 = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)

# Keep only the convolutional trunk: drop the last two children, i.e. the
# built-in global average pooling layer and the fully connected classifier.
# (Slicing with [:-1] removed only the classifier, leaving two pooling layers
# stacked; the output is the same, but this now matches the stated intent.)
resnet18 = nn.Sequential(*list(resnet18.children())[:-2])

# Add spatial average pooling layer
resnet18.add_module('avgpool', nn.AdaptiveAvgPool2d(1))

# Inference mode: fixes batch-norm statistics.
resnet18.eval()


# Define a transformation to preprocess the input image
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x[:3, :, :]),  # Remove alpha channel if present
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Folder containing images
# NOTE(review): Step 2 writes its images to 'Single_Spectogram_01' - confirm
# that 'Spectogram_01' is the folder intended here.
image_folder = 'Spectogram_01'

# List to store individual outputs
All_audio_outputs = []

# Process each image in the folder. Sorted so that the order of the collected
# features is reproducible (os.listdir() order is arbitrary).
for filename in sorted(os.listdir(image_folder)):
    if filename.lower().endswith('.png'):
        image_path = os.path.join(image_folder, filename)

        # Load and preprocess the image; the context manager closes the file
        # handle once the pixels have been converted to a tensor.
        with Image.open(image_path) as image:
            input_tensor = transform(image)
        input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

        # Perform inference
        with torch.no_grad():
            audio_output = resnet18(input_batch)

        # Append the output to the list
        All_audio_outputs.append(audio_output)

# Guard against an empty folder, which would otherwise raise IndexError.
if All_audio_outputs:
    print(All_audio_outputs[0].shape)
else:
    print(f"No .png images found in '{image_folder}'")

Using cache found in C:\Users\User/.cache\torch\hub\pytorch_vision_v0.10.0


torch.Size([1, 512, 1, 1])
