# In [None]:
import os, warnings
import cv2
import shutil
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from decord import VideoReader
from moviepy.editor import AudioFileClip
import librosa
from scipy.io import wavfile
from PIL import Image

# Side length, in pixels, of the square frames: read_video resizes every
# decoded frame to (input_size, input_size).
input_size = 224
# Number of frames sampled from each video; video_audio passes it to
# clip_generator as num_frames.
num_frame = 8
# Spacing, in source frames, between consecutive sampled frames; video_audio
# passes it to clip_generator as frame_rate.
sampling_rate = 6

def normalize_audio(audio):
    """Scale an audio signal so that its largest absolute sample equals 1.

    Args:
        audio: Array-like of samples, any shape. Non-floating input (for
            example int16 PCM) is promoted to ``float64`` before scaling;
            floating-point input keeps its dtype.

    Returns:
        A floating-point ``numpy.ndarray`` with the same shape as ``audio``.
        Empty or all-zero (silent) input yields zeros of that shape instead
        of NaN values or an exception.
    """
    samples = np.asarray(audio)
    if not np.issubdtype(samples.dtype, np.floating):
        # Promote before taking abs(): np.abs on int16 wraps -32768 back to
        # -32768, which would under-estimate the peak of full-scale PCM.
        samples = samples.astype(np.float64)

    peak = np.max(np.abs(samples)) if samples.size else 0.0
    if peak == 0:
        # Nothing to scale; dividing would produce NaN/inf.
        return np.zeros_like(samples)
    return samples / peak

def mel_spectrogram(signal, sample_rate, n_mels=128, fmax=8000):
    """Compute the mel spectrogram of ``signal`` on a decibel scale.

    Args:
        signal: Audio samples, forwarded to librosa as ``y``.
        sample_rate: Sampling rate of ``signal``, forwarded as ``sr``.
        n_mels: Number of mel bands requested from librosa.
        fmax: Upper frequency bound forwarded to librosa.

    Returns:
        The result of ``librosa.power_to_db`` applied to the mel power
        spectrogram, using the spectrogram's maximum as the reference.
    """
    mel_kwargs = {
        "y": signal,
        "sr": sample_rate,
        "n_mels": n_mels,
        "fmax": fmax,
    }
    power = librosa.feature.melspectrogram(**mel_kwargs)
    decibels = librosa.power_to_db(power, ref=np.max)
    return decibels

def read_video(file_path):
    """Decode every frame of a video and resize it to the configured input size.

    Args:
        file_path: Path of the video file opened with decord's ``VideoReader``.

    Returns:
        The output of ``format_frames`` for all decoded frames, resized to
        ``(input_size, input_size)``.
    """
    reader = VideoReader(file_path)
    every_index = range(len(reader))
    # The whole video is fetched in a single batch and converted to NumPy.
    raw_frames = reader.get_batch(every_index).asnumpy()
    target_size = (input_size, input_size)
    return format_frames(raw_frames, output_size=target_size)

def format_frames(frame, output_size):
    """Convert frames to ``tf.uint8`` and resize them to ``output_size``.

    Args:
        frame: Image data accepted by ``tf.image.convert_image_dtype``.
        output_size: Two-element sequence forwarded (as a list) to
            ``tf.image.resize`` as the target size.

    Returns:
        The tensor produced by ``tf.image.resize``.
    """
    as_uint8 = tf.image.convert_image_dtype(frame, tf.uint8)
    target = list(output_size)
    return tf.image.resize(as_uint8, size=target)

def uniform_temporal_subsample(x, num_samples, clip_idx, total_clips, frame_rate=1, temporal_dim=-4):
    """Gather ``num_samples`` evenly spaced frames along the temporal axis of ``x``.

    Args:
        x: Tensor whose ``temporal_dim`` axis indexes frames.
        num_samples: Number of frames to pick.
        clip_idx: Index of the clip; it shifts the start of the sampling window.
        total_clips: Number of clips the available start offsets are divided among.
        frame_rate: Distance, in frames, between consecutive picked frames.
        temporal_dim: Axis of ``x`` to sample along.

    Returns:
        ``x`` gathered at the computed frame indices along ``temporal_dim``.
    """
    # Work in float32 so the offset arithmetic and linspace share one dtype.
    frame_count = tf.cast(tf.shape(x)[temporal_dim], tf.float32)

    window = num_samples * frame_rate
    clip_stride = (frame_count - window) // total_clips
    first = clip_idx * clip_stride
    last = first + (num_samples - 1) * frame_rate

    positions = tf.linspace(first, last, num_samples)
    # Keep every position inside the valid frame range before rounding.
    positions = tf.clip_by_value(positions, 0, frame_count - 1)
    positions = tf.cast(tf.round(positions), tf.int32)
    return tf.gather(x, positions, axis=temporal_dim)

def clip_generator(image, num_frames=32, frame_rate=1, num_clips=1, crop_size=224):
    """Sample ``num_clips`` clips from a frame tensor and flatten them into one batch.

    Args:
        image: Frame tensor whose first axis indexes frames.
        num_frames: Frames sampled per clip.
        frame_rate: Distance, in frames, between consecutive sampled frames.
        num_clips: Number of clips to sample.
        crop_size: Height and width used when reshaping the result.

    Returns:
        A tensor reshaped to
        ``[num_clips * num_frames, crop_size, crop_size, 3]``.
    """
    clips = [
        uniform_temporal_subsample(
            image, num_frames, clip_no, num_clips, frame_rate=frame_rate, temporal_dim=0
        )
        for clip_no in range(num_clips)
    ]
    stacked = tf.stack(clips)
    flat_shape = [num_clips * num_frames, crop_size, crop_size, 3]
    return tf.reshape(stacked, flat_shape)

def _load_normalized_mono(video_path, wave_name):
    """Extract the audio track to ``wave_name`` and return ``(fs, samples)`` as normalized mono."""
    audio_clip = AudioFileClip(video_path)
    try:
        audio_clip.write_audiofile(wave_name)
    finally:
        # Release the reader held by the clip even if writing fails.
        audio_clip.close()

    fs, audio_data = wavfile.read(wave_name)
    if audio_data.ndim > 1:
        # wavfile.read returns (samples, channels) for multi-channel files;
        # average the channels so time slices are plain 1-D signals.
        audio_data = audio_data.mean(axis=1)
    return fs, normalize_audio(audio_data)


def _render_mel_frame(signal, sample_rate, size):
    """Render the mel spectrogram of ``signal`` as a ``size`` x ``size`` BGR uint8 image."""
    mel_spec = mel_spectrogram(signal, sample_rate)
    fig, ax = plt.subplots(figsize=(4, 4))
    try:
        ax.imshow(mel_spec, aspect='auto', origin='lower', cmap='magma')
        ax.axis('off')
        fig.canvas.draw()
        # buffer_rgba() replaces tostring_rgb(), which recent Matplotlib
        # releases deprecated and removed; copy before the figure is closed.
        rgba = np.array(fig.canvas.buffer_rgba())
    finally:
        plt.close(fig)
    # cv2.VideoWriter expects BGR channel order.
    bgr = cv2.cvtColor(rgba, cv2.COLOR_RGBA2BGR)
    return cv2.resize(bgr, (size, size))


def video_audio(path, save_path):
    """Convert every video in ``path`` into a short spectrogram-plus-frames clip.

    For each file in ``path`` an ``.mp4`` with the same base name is written
    to ``save_path``. It contains ``num_frame`` mel-spectrogram images (one
    per consecutive 1/16 s slice of the audio track) followed by
    ``num_frame`` frames sampled from the video, all sized
    ``input_size`` x ``input_size`` and encoded at 16 fps. The extracted
    audio is also written to ``save_path`` as a ``.wav`` file and is left
    there.

    Args:
        path: Directory containing the source videos. Entries that are not
            regular files are skipped.
        save_path: Output directory; created if it does not exist.
    """
    os.makedirs(save_path, exist_ok=True)

    fps = 16  # output frame rate; also fixes the audio slice length (fs // fps)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    frame_size = (input_size, input_size)

    for video_file in os.listdir(path):
        video_path = os.path.join(path, video_file)
        if not os.path.isfile(video_path):
            continue

        video_name = os.path.splitext(video_file)[0]
        path_video_save = os.path.join(save_path, f"{video_name}.mp4")
        wave_name = os.path.join(save_path, f"{video_name}.wav")

        video_ds = read_video(video_path)
        video_ds = clip_generator(
            video_ds, num_frame, sampling_rate, num_clips=1, crop_size=input_size
        )

        fs, audio_data = _load_normalized_mono(video_path, wave_name)
        step = fs // fps

        # Open the writer only once all inputs are decoded, so a failure
        # above does not leave an empty .mp4 behind.
        output_video = cv2.VideoWriter(path_video_save, fourcc, float(fps), frame_size)
        try:
            for i in range(num_frame):
                signal = audio_data[i * step:(i + 1) * step]
                if len(signal) < step:
                    # Audio shorter than the window: pad with silence rather
                    # than handing librosa an empty or truncated slice.
                    signal = np.pad(signal, (0, step - len(signal)))
                output_video.write(_render_mel_frame(signal, fs, input_size))

            for i in range(num_frame):
                rgb_frame = video_ds[i].numpy().astype('uint8')
                # Decoded frames are RGB; the writer expects BGR.
                output_video.write(cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR))
        finally:
            output_video.release()
        # NOTE: the intermediate WAV file is intentionally kept in save_path.

#if __name__ == '__main__':
    #input_path = r"D:\eNTERFACE\all_avi_files"  # Path containing the 1263 .avi files
    #output_path = r"D:\eNTERFACE\melspect_videoframes"       # Output path to save only .mp4 files

    #video_audio(input_path, output_path)
