In [1]:
!pip install librosa opencv-python scipy tensorflow tqdm audioread gdown


Collecting pytube3
  Downloading pytube3-9.6.4-py3-none-any.whl (38 kB)
Installing collected packages: pytube3
Successfully installed pytube3-9.6.4



### Downloading Videos from Google Drive



In [2]:
import gdown

def download_video(url, path):

    file_id_video = url.split('/')[5]
    download_url_video = f'https://drive.google.com/uc?id={file_id_video}'
    gdown.download(download_url_video, path, quiet=False)

    print(f"Video downloaded to: {path}")

In [3]:

sharing_link_video1 = 'https://drive.google.com/file/d/1vrz_EJ3D1j6yYi5NnlTYssfGoyq1vqnw/view?usp=drive_link'
sharing_link_video2 = 'https://drive.google.com/file/d/1sb5ODQcGu-N3f_6XllVTBAsMzRaMGWEn/view?usp=drive_link'

path = '/content/'
download_video(sharing_link_video1, path)
download_video(sharing_link_video2, path)


Downloading...
From: https://drive.google.com/uc?id=1vrz_EJ3D1j6yYi5NnlTYssfGoyq1vqnw
To: /content/cut-part .mp4
100%|██████████| 3.81M/3.81M [00:00<00:00, 156MB/s]


Video downloaded to: /content/


Downloading...
From: https://drive.google.com/uc?id=1sb5ODQcGu-N3f_6XllVTBAsMzRaMGWEn
To: /content/main .mp4
100%|██████████| 11.8M/11.8M [00:00<00:00, 80.2MB/s]

Video downloaded to: /content/





In [23]:
import librosa
import audioread
import cv2
import numpy as np
from scipy.spatial.distance import cosine
from tensorflow.keras.preprocessing import image
from tqdm import tqdm

from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input as preprocess_resnet

from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input as preprocess_inception



> Loading video files to extract audio features




In [5]:
def load_audio(video_path):
    with audioread.audio_open(video_path) as input_file:
        sr = input_file.samplerate
        y = np.concatenate([np.frombuffer(buf, dtype=np.int16) for buf in input_file])
        y = librosa.util.buf_to_float(y, n_bytes=2, dtype=np.float32)
    return y, sr

>Extracting audio features from video files using **librosa**. These features include MFCCs, chroma, spectral contrast, tonnetz, mel spectrogram, and zero-crossing rate.



In [6]:
def extract_audio_features(video_path):
    y, sr = load_audio(video_path)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
    zero_crossings = librosa.feature.zero_crossing_rate(y)
    return np.concatenate((mfccs, chroma, spectral_contrast, tonnetz, mel_spectrogram, zero_crossings), axis=0)

> To check if features are similar, I am using the cosine distance. Cosine distance is typically used to measure dissimilarity. Subtracting the mean distance from 1 converts it to a similarity score, where higher values indicate higher similarity.

In [7]:
def calculate_audio_similarity(features1, features2):
    min_length = min(features1.shape[1], features2.shape[1])
    features1 = features1[:, :min_length]
    features2 = features2[:, :min_length]
    distances = [cosine(f1, f2) for f1, f2 in zip(features1.T, features2.T)]
    similarity = 1 - np.mean(distances)
    return similarity * 100


> This function aims to extract visual features from video files using the pre-trained deep-learning model. I am using two different models, Resnet50 and InceptionV3. Additionally, OpenCV to read and extract video frames.

In [26]:
def extract_visual_features(video_path, shape, model, batch_size=32):
    cap = cv2.VideoCapture(video_path)
    features = []
    frames = []
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    for _ in tqdm(range(frame_count), desc="Processing Video Frames"):
        ret, frame = cap.read()
        if ret:
            img = cv2.resize(frame, shape)
            img = image.img_to_array(img)
            # img = preprocess_input(img)
            if model.name == 'resnet50':
              img = preprocess_resnet(img)
            elif model.name == 'inception_v3':
              img = preprocess_inception(img)

            frames.append(img)
            if len(frames) == batch_size:
                batch = np.array(frames)
                batch_features = model.predict(batch)
                features.extend(batch_features)
                frames = []
        else:
            break

    if frames:
        batch = np.array(frames)
        batch_features = model.predict(batch)
        features.extend(batch_features)

    cap.release()
    return np.array(features)




> Here I am calculating the **cosine** distance of the features that I extracted from the previous function. A value closer to 1 means that feature vectors are likely to be similar.

In [9]:
def calculate_visual_similarity(features1, features2):
    min_length = min(len(features1), len(features2))
    features1 = features1[:min_length]
    features2 = features2[:min_length]
    distances = [cosine(f1, f2) for f1, f2 in zip(features1, features2)]
    similarity = 1 - np.mean(distances)
    return similarity * 100

> Calculating the overall similarity by taking the average of the audio similarity and visual similarity scores.

In [10]:
def calculate_overall_similarity(audio_similarity, visual_similarity):
    overall_similarity = (audio_similarity + visual_similarity) / 2
    return overall_similarity

In [30]:
import pandas as pd

video1_path = '/content/cut-part .mp4'
video2_path = '/content/main .mp4'

results = []


> Runding audio similarty functions

In [12]:
print("Extracting audio features for video 1...")
audio_features1 = extract_audio_features(video1_path)
print("Extracting audio features for video 2...")
audio_features2 = extract_audio_features(video2_path)

print("Calculating features distance...")
audio_similarity = calculate_audio_similarity(audio_features1, audio_features2)

print(f"Audio Similarity: {audio_similarity:.2f}%")


Extracting audio features for video 1...
Extracting audio features for video 2...
Calculating features distance...
Audio Similarity: 95.21%


In [13]:
def run_video_similarity_model(model_name, model, shape, audio_similarity):

    print("Extracting visual features for videos...")
    visual_features1 = extract_visual_features(video1_path, shape, model)
    visual_features2 = extract_visual_features(video2_path, shape, model)

    print("Calculating features distance...")
    visual_similarity = calculate_visual_similarity(visual_features1, visual_features2)

    overall_similarity_percentage = calculate_overall_similarity(audio_similarity, visual_similarity)

    print(f"Model : {model_name}")
    print(f"Visual Similarity: {visual_similarity:.2f}%")
    print(f"Overall Similarity: {overall_similarity_percentage:.2f}%")

    return model_name, visual_similarity, overall_similarity_percentage

### Resnet50 model visual similarity Output




In [34]:
input_shape = (224, 224, 3)
shape = (224, 224)
model = ResNet50(weights='imagenet', include_top=False, pooling='avg', input_shape=input_shape)

result = run_video_similarity_model("Resnet50", model, shape, audio_similarity)

results.append(result)

Extracting visual features for videos...


Processing Video Frames:   5%|▍         | 16/343 [00:00<00:02, 159.87it/s]



Processing Video Frames:  17%|█▋        | 57/343 [00:01<00:06, 46.98it/s]



Processing Video Frames:  21%|██        | 72/343 [00:01<00:05, 49.00it/s]



Processing Video Frames:  36%|███▌      | 122/343 [00:01<00:02, 86.89it/s]



Processing Video Frames:  40%|████      | 138/343 [00:02<00:02, 81.27it/s]



Processing Video Frames:  54%|█████▍    | 186/343 [00:02<00:01, 100.10it/s]



Processing Video Frames:  59%|█████▉    | 202/343 [00:02<00:01, 84.55it/s] 



Processing Video Frames:  65%|██████▌   | 224/343 [00:03<00:01, 86.19it/s]



Processing Video Frames:  84%|████████▎ | 287/343 [00:03<00:00, 125.06it/s]



Processing Video Frames:  89%|████████▉ | 305/343 [00:03<00:00, 110.26it/s]



Processing Video Frames: 100%|██████████| 343/343 [00:04<00:00, 83.23it/s]




Processing Video Frames:   2%|▏         | 23/1028 [00:00<00:04, 229.52it/s]



Processing Video Frames:   4%|▍         | 46/1028 [00:00<00:09, 101.72it/s]



Processing Video Frames:   9%|▉         | 92/1028 [00:00<00:07, 127.50it/s]



Processing Video Frames:  11%|█         | 110/1028 [00:01<00:08, 106.67it/s]



Processing Video Frames:  15%|█▍        | 153/1028 [00:01<00:07, 113.59it/s]



Processing Video Frames:  18%|█▊        | 186/1028 [00:01<00:08, 95.62it/s]



Processing Video Frames:  21%|██        | 216/1028 [00:02<00:09, 82.98it/s]



Processing Video Frames:  24%|██▎       | 243/1028 [00:02<00:10, 76.51it/s]



Processing Video Frames:  27%|██▋       | 282/1028 [00:03<00:09, 80.04it/s]



Processing Video Frames:  30%|██▉       | 308/1028 [00:03<00:10, 69.70it/s]



Processing Video Frames:  33%|███▎      | 342/1028 [00:04<00:08, 79.50it/s]



Processing Video Frames:  37%|███▋      | 381/1028 [00:04<00:06, 105.65it/s]



Processing Video Frames:  39%|███▊      | 396/1028 [00:04<00:07, 89.86it/s] 



Processing Video Frames:  43%|████▎     | 438/1028 [00:05<00:07, 78.86it/s]



Processing Video Frames:  46%|████▌     | 473/1028 [00:05<00:05, 97.68it/s]



Processing Video Frames:  47%|████▋     | 486/1028 [00:05<00:06, 83.06it/s]



Processing Video Frames:  52%|█████▏    | 538/1028 [00:06<00:04, 117.44it/s]



Processing Video Frames:  54%|█████▍    | 554/1028 [00:06<00:05, 93.31it/s] 



Processing Video Frames:  59%|█████▉    | 604/1028 [00:06<00:03, 120.57it/s]



Processing Video Frames:  60%|██████    | 620/1028 [00:07<00:04, 101.13it/s]



Processing Video Frames:  64%|██████▍   | 661/1028 [00:07<00:03, 110.64it/s]



Processing Video Frames:  66%|██████▌   | 676/1028 [00:07<00:03, 88.48it/s] 



Processing Video Frames:  71%|███████   | 730/1028 [00:08<00:02, 114.62it/s]



Processing Video Frames:  73%|███████▎  | 746/1028 [00:08<00:02, 101.04it/s]



Processing Video Frames:  77%|███████▋  | 792/1028 [00:08<00:01, 118.52it/s]



Processing Video Frames:  79%|███████▊  | 808/1028 [00:09<00:02, 100.95it/s]



Processing Video Frames:  84%|████████▍ | 862/1028 [00:09<00:01, 121.66it/s]



Processing Video Frames:  86%|████████▌ | 879/1028 [00:09<00:01, 95.61it/s] 



Processing Video Frames:  90%|████████▉ | 922/1028 [00:10<00:00, 111.33it/s]



Processing Video Frames:  91%|█████████ | 938/1028 [00:10<00:00, 95.26it/s] 



Processing Video Frames:  96%|█████████▋| 991/1028 [00:10<00:00, 119.48it/s]



Processing Video Frames:  98%|█████████▊| 1008/1028 [00:11<00:00, 104.10it/s]



Processing Video Frames: 100%|██████████| 1028/1028 [00:11<00:00, 90.87it/s]

Calculating features distance...
Model : Resnet50
Visual Similarity: 63.55%
Overall Similarity: 79.38%





In [35]:
# Save the model
save_dir = '/content/ResNet50'
tf.saved_model.save(model, save_dir)
print(f"Model saved to {save_dir}")

!zip -r /content/ResNet50.zip /content/ResNet50

Model saved to /content/ResNet50
  adding: content/ResNet50/ (stored 0%)
  adding: content/ResNet50/assets/ (stored 0%)
  adding: content/ResNet50/saved_model.pb (deflated 92%)
  adding: content/ResNet50/fingerprint.pb (stored 0%)
  adding: content/ResNet50/variables/ (stored 0%)
  adding: content/ResNet50/variables/variables.data-00000-of-00001 (deflated 7%)
  adding: content/ResNet50/variables/variables.index (deflated 81%)


### InceptionV3 model visual similarity Output


In [32]:
input_shape = (299, 299, 3)
shape = (299, 299)
model = InceptionV3(weights='imagenet', include_top=False, pooling='avg', input_shape=input_shape)

result = run_video_similarity_model("InceptionV3", model, shape, audio_similarity)

results.append(result)

Extracting visual features for videos...


Processing Video Frames:   6%|▌         | 21/343 [00:00<00:03, 96.88it/s]



Processing Video Frames:  17%|█▋        | 57/343 [00:02<00:10, 28.10it/s]



Processing Video Frames:  21%|██        | 71/343 [00:02<00:08, 32.62it/s]



Processing Video Frames:  35%|███▍      | 119/343 [00:02<00:03, 62.86it/s]



Processing Video Frames:  45%|████▌     | 155/343 [00:03<00:02, 80.36it/s]



Processing Video Frames:  50%|████▉     | 170/343 [00:03<00:02, 69.92it/s]



Processing Video Frames:  64%|██████▍   | 220/343 [00:04<00:01, 71.22it/s]



Processing Video Frames:  73%|███████▎  | 250/343 [00:04<00:01, 72.71it/s]



Processing Video Frames:  76%|███████▋  | 262/343 [00:05<00:01, 57.57it/s]



Processing Video Frames:  92%|█████████▏| 316/343 [00:05<00:00, 95.88it/s]



Processing Video Frames: 100%|██████████| 343/343 [00:05<00:00, 58.14it/s]




Processing Video Frames:   2%|▏         | 22/1028 [00:00<00:04, 216.26it/s]



Processing Video Frames:   6%|▌         | 63/1028 [00:00<00:07, 122.62it/s]



Processing Video Frames:   8%|▊         | 79/1028 [00:00<00:10, 88.55it/s] 



Processing Video Frames:  12%|█▏        | 124/1028 [00:01<00:08, 110.26it/s]



Processing Video Frames:  14%|█▎        | 140/1028 [00:01<00:10, 86.20it/s] 



Processing Video Frames:  18%|█▊        | 187/1028 [00:01<00:07, 109.09it/s]



Processing Video Frames:  22%|██▏       | 222/1028 [00:02<00:07, 103.49it/s]



Processing Video Frames:  23%|██▎       | 237/1028 [00:02<00:09, 82.30it/s] 



Processing Video Frames:  27%|██▋       | 281/1028 [00:02<00:07, 101.21it/s]



Processing Video Frames:  29%|██▉       | 296/1028 [00:03<00:08, 83.41it/s] 



Processing Video Frames:  34%|███▍      | 350/1028 [00:03<00:06, 100.55it/s]



Processing Video Frames:  37%|███▋      | 377/1028 [00:04<00:07, 82.67it/s]



Processing Video Frames:  39%|███▉      | 403/1028 [00:04<00:09, 68.62it/s]



Processing Video Frames:  43%|████▎     | 445/1028 [00:05<00:07, 78.11it/s]



Processing Video Frames:  46%|████▌     | 471/1028 [00:05<00:08, 66.90it/s]



Processing Video Frames:  49%|████▉     | 508/1028 [00:06<00:06, 82.03it/s]



Processing Video Frames:  51%|█████     | 522/1028 [00:06<00:07, 69.10it/s]



Processing Video Frames:  55%|█████▌    | 569/1028 [00:06<00:04, 94.57it/s]



Processing Video Frames:  57%|█████▋    | 584/1028 [00:07<00:05, 77.95it/s]



Processing Video Frames:  61%|██████▏   | 630/1028 [00:07<00:04, 97.99it/s]



Processing Video Frames:  65%|██████▍   | 667/1028 [00:08<00:03, 102.63it/s]



Processing Video Frames:  66%|██████▋   | 682/1028 [00:08<00:04, 80.46it/s] 



Processing Video Frames:  71%|███████   | 731/1028 [00:08<00:02, 99.36it/s]



Processing Video Frames:  73%|███████▎  | 746/1028 [00:09<00:03, 80.59it/s]



Processing Video Frames:  77%|███████▋  | 787/1028 [00:09<00:02, 87.88it/s]



Processing Video Frames:  80%|████████  | 823/1028 [00:10<00:02, 96.56it/s]



Processing Video Frames:  81%|████████▏ | 837/1028 [00:10<00:02, 79.84it/s]



Processing Video Frames:  86%|████████▋ | 888/1028 [00:10<00:01, 104.07it/s]



Processing Video Frames:  90%|████████▉ | 925/1028 [00:11<00:01, 102.93it/s]



Processing Video Frames:  91%|█████████▏| 940/1028 [00:11<00:01, 83.62it/s] 



Processing Video Frames:  96%|█████████▌| 985/1028 [00:11<00:00, 98.17it/s]



Processing Video Frames:  97%|█████████▋| 1000/1028 [00:12<00:00, 78.46it/s]



Processing Video Frames: 100%|██████████| 1028/1028 [00:12<00:00, 81.87it/s]

Calculating features distance...
Model : InceptionV3
Visual Similarity: 71.24%
Overall Similarity: 83.22%





> Creating a Panda Dataframe Table for output

In [33]:
df = pd.DataFrame(results, columns=['Model', 'Visual Similarity', 'Overall Similarity'])

print(f"Audio Similarity: {audio_similarity:.2f}%")
print('\n')
print("Model Performance Table for Video Similarity:")
print(df)

Audio Similarity: 95.21%


Model Performance Table for Video Similarity:
         Model  Visual Similarity  Overall Similarity
0     Resnet50          63.547446           79.378380
1  InceptionV3          71.237009           83.223161


In [None]:
# Save the model
save_dir = '/content/inception_v3_model'
tf.saved_model.save(model, save_dir)
print(f"Model saved to {save_dir}")

In [None]:
!zip -r /content/inception_v3_model.zip /content/inception_v3_model
