In [1]:
# install necessary packages
!pip install opencv-python
!pip install pytube
!pip install pydub

Collecting pytube
  Downloading pytube-11.0.1-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.0 MB/s 
[?25hInstalling collected packages: pytube
Successfully installed pytube-11.0.1
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [3]:
# upgrade pytube to obtain the data from YouTube
!pip install --upgrade pytube



In [4]:
import cv2
from pytube import YouTube
import os
import moviepy
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.audio.io.AudioFileClip import AudioFileClip
import time
import pandas as pd
import numpy as np
import tempfile
from tempfile import TemporaryFile
import pydub
from pydub import AudioSegment
import tensorflow as tf
from tensorflow.keras import layers, models, Sequential
import scipy
from scipy import signal
from scipy.io import wavfile
from keras.layers import Conv1D, MaxPooling1D

We will use this dataset (test one) to collect the random videos from YouTube. 

https://looking-to-listen.github.io/avspeech/download.html

In [5]:
# put the avspeech_test.csv file to Files
dataset = pd.read_csv('/content/avspeech_test.csv')

In [6]:
# change the name of the columns for convenience
dataset = dataset.rename(columns={"u5MPyrRJPmc": "Youtube ID", "108.240000": "start segment",
                                  "111.240000": "end segment", "0.849219": "X coordinate",
                                  "0.305556": "Y coordinate",})

In [7]:
# split the data by 1000
newdata = dataset[:1000]

In [10]:
# create directory to save the video data temporarily and split the video data
# into sound and video without sound 
directory = "dataset"
parent_dir = "/content/"
path = os.path.join(parent_dir, directory) 
try:
  os.mkdir(path)
  print("Directory '% s' created" % directory) 
except:
  print("Directory '% s' exists" % directory)
parent_dir = "/content/dataset/"
# create subdirectories inside the "/content/dataset/"
image_path = os.path.join(parent_dir, "image")
audio_path = os.path.join(parent_dir, "audio")
try:
  os.mkdir(image_path)
  os.mkdir(audio_path)
except:
  print("at least one of train and test directories already existed")

Directory 'dataset' exists


In [11]:
# for store numpy arrays of image and audio
image_data = []
audio_data = []

In [12]:
# for clean up the directory specified by the path
def cleanup(path):
  for f in os.listdir(path):
    try:
      os.remove(os.path.join(path, f))
    except:
      print(f + " : unable to remove")

In [13]:
# convert wav file into log_spectogram
def log_spectogram(wav_path):
  sf, audio = wavfile.read(wav_path)
  sig = np.mean(audio, axis = 1)
  f, t, Sxx = signal.spectrogram(sig, sf)
  Sxx = np.add(Sxx, 1)
  Sxx = np.log10(Sxx)
  return Sxx

In [14]:
# convert log_spectogram (2D array, which represents time and frequency), 
# into 1D array which size is (1, 30)
def audio_conv(audio_log):
  audio_log = np.reshape(audio_log, (1, 129, 196))
  audio = Conv1D(filters = 1, kernel_size = 30, padding = 'same',
                activation = 'relu')(audio_log)
  audio = np.resize(audio_log, (1, 120, 1))
  audio = MaxPooling1D(pool_size=4, padding='same')(audio)    
  audio = Conv1D(filters = 1, kernel_size = 3, padding = 'same',
                activation = 'relu')(audio)
  audio = np.resize(audio_log, (1, 30))
  return audio

In [15]:
# convert mp4 file to numpy array
# the size of the numpy array for each image is 
# (time(30), height(720), width(1280), channels(3))
def video_processing(mp4_file):
  frames = []
  cap = cv2.VideoCapture(mp4_file)
  ret = True
  while ret:
    ret, img = cap.read()
    if ret:
      frames.append(img)
  video = np.stack(frames, axis=0)
  video = video.astype('float32')
  return video

In [16]:
# convert nparray to (30, 90, 160) array using conv architecture
def image_conv(video):
  rgb_weights = [0.2989, 0.5870, 0.1140]
  # convert image into gray scale for reducing a dimension.
  video = np.dot(video[...,:3], rgb_weights)
  video = np.resize(video, (30, 720, 1280, 1))
  # compress the image by using maxpooling.
  video = layers.MaxPool2D()(video)
  video = layers.MaxPool2D()(video)
  video = layers.MaxPool2D()(video)
  video = np.resize(video, (30, 90, 160))
  return video

In [None]:
counter = 0
for row in newdata.iterrows():
  ID = row[1][0]
  start = int(row[1][1]) + 1
  end = int(row[1][2])
  path = "https://www.youtube.com/watch?v=" + ID
  try:
    yt = YouTube(path)
    # yt.streams do not work (11.23.2021)
    audio = yt.streams.filter(only_audio=True)[0].download('/content/dataset/audio/', filename=ID)
    image = yt.streams.filter(only_video=True)[0].download('/content/dataset/image/', filename=ID)
    audio_inputpath = '/content/dataset/audio/' + ID
    image_inputpath = '/content/dataset/image/' + ID
    audio_outputpath = '/content/dataset/audio/' + ID + ' ' + str(start) + 'sec' + '.wav'
    # for each audio and video, we will get one second of audio without sound 
    # and one second of sound.
    # after we extract the audio and video from each clip, append it into 
    # image_data and audio_data with numpy format.
    with AudioFileClip(audio_inputpath) as audio:
      clip = audio.subclip(start, start + 1)
      clip.write_audiofile(audio_outputpath)
      spectogram = log_spectogram(audio_outputpath)
      audio_data.append(audio_conv(spectogram))
    image_outputpath = '/content/dataset/image/' + ID + ' ' + str(start) + 'sec' + '.mp4'
    with VideoFileClip(image_inputpath) as video:
      clip = video.subclip(start, start + 1)
      clip.write_videofile(image_outputpath, audio_codec='aac')
      processed = video_processing(image_outputpath)
      image_data.append(image_conv(processed))  
    cleanup('/content/dataset/audio')
    cleanup('/content/dataset/image')
  except:
    message = ID + " : video not found."
    print(message)

[MoviePy] Writing audio in /content/dataset/audio/H1ulMfj5wRY 113sec.wav


100%|██████████| 23/23 [00:00<00:00, 177.68it/s]

[MoviePy] Done.





[MoviePy] >>>> Building video /content/dataset/image/H1ulMfj5wRY 113sec.mp4
[MoviePy] Writing video /content/dataset/image/H1ulMfj5wRY 113sec.mp4


 98%|█████████▊| 50/51 [00:01<00:00, 43.70it/s]


H1ulMfj5wRY : video not found.
-wuxbgMRIWs : video not found.
[MoviePy] Writing audio in /content/dataset/audio/GNRPRH-E-sI 31sec.wav


100%|██████████| 23/23 [00:00<00:00, 180.08it/s]

[MoviePy] Done.





[MoviePy] >>>> Building video /content/dataset/image/GNRPRH-E-sI 31sec.mp4
[MoviePy] Writing video /content/dataset/image/GNRPRH-E-sI 31sec.mp4


100%|██████████| 30/30 [00:00<00:00, 57.63it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: /content/dataset/image/GNRPRH-E-sI 31sec.mp4 



In [None]:
# save numpy arrays as npy format
np.save('/content/dataset/audio/audio.npy', np.array(audio_data))
np.save('/content/dataset/image/image.npy', np.array(image_data))

In [None]:
# convert dataset folder into zip file
!zip -r /content/dataset.zip /content/dataset

  adding: content/dataset/ (stored 0%)
  adding: content/dataset/audio/ (stored 0%)
  adding: content/dataset/audio/audio.npy (deflated 47%)
  adding: content/dataset/image/ (stored 0%)
  adding: content/dataset/image/image.npy (deflated 47%)
