Before running this notebook, upload your dataset to your Google Drive as a single zip file.

In [None]:
# Mount Google Drive at /content/drive; a later cell writes its output videos
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ðŸš€ Step 1: Install gdown (simple and reliable)
!pip install --upgrade gdown

# ðŸš€ Step 2: Download your dataset from Google Drive
import gdown

file_id = "1LUXN1t7PCRCGDmA56W_M4kbpuV5SwCeq"
url = f"https://drive.google.com/uc?id={file_id}"
output = "datasets.zip"

print("ðŸ“¥ Downloading datasets...")
gdown.download(url, output, quiet=False)

# ðŸš€ Step 3: Unzip the dataset
!unzip -q datasets.zip -d ./dataset_extracted
print("âœ… Unzipped to ./dataset_extracted/")


📥 Downloading datasets...


Downloading...
From (original): https://drive.google.com/uc?id=1LUXN1t7PCRCGDmA56W_M4kbpuV5SwCeq
From (redirected): https://drive.google.com/uc?id=1LUXN1t7PCRCGDmA56W_M4kbpuV5SwCeq&confirm=t&uuid=0a502f9f-599e-46bf-aca9-ccd239439029
To: /content/datasets.zip
100%|██████████| 464M/464M [00:06<00:00, 68.8MB/s]


✅ Unzipped to ./dataset_extracted/


In [None]:
import glob
import numpy as np
import cv2

# Dataset location (note the space in the folder name)
base_path = '/content/dataset_extracted/dataset 1/'

# Videos reporting fewer frames than this are treated as broken or too short.
MIN_FRAMES = 100

# Gather video files from both class folders
real_videos = glob.glob(base_path + 'real/*.mp4')
fake_videos = glob.glob(base_path + 'fake/*.mp4')

# Combine both
video_files = real_videos + fake_videos

frame_counts = []

for video_file in video_files:
    cap = cv2.VideoCapture(video_file)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        # Release on every path; previously a skipped video hit `continue`
        # before release() and leaked its capture handle.
        cap.release()

    # Skip broken or extremely short videos
    if total_frames < MIN_FRAMES:
        continue

    frame_counts.append(total_frames)

# Summary of the videos that passed the frame threshold
print(f"Total valid videos: {len(frame_counts)}")
if len(frame_counts) > 0:
    print(f"Average frames per video: {round(np.mean(frame_counts), 2)}")
    print(f"Min frames: {np.min(frame_counts)}")
    print(f"Max frames: {np.max(frame_counts)}")
else:
    print("No valid videos found. Check your path or frame threshold.")


In [None]:
# ---- Paste this entire cell into Colab and run ----
import os, glob, cv2, face_recognition
from tqdm.autonotebook import tqdm

# Input and output locations; change these two constants to relocate the data.
# SRC_ROOT contains a space in its last component; OUT_DIR lives on the mounted Drive.
SRC_ROOT = '/content/dataset_extracted/dataset 1'
OUT_DIR = '/content/drive/MyDrive/FF_REAL_Face_only_data'

# Create the destination folder if needed, then echo the configuration.
os.makedirs(OUT_DIR, exist_ok=True)
for label, value in (
    ("Source root:", SRC_ROOT),
    ("Output dir:", OUT_DIR),
    ("Output exists:", os.path.exists(OUT_DIR)),
):
    print(label, value)

# Collect every .mp4 anywhere below the source root.
mp4_pattern = os.path.join(SRC_ROOT, '**', '*.mp4')
video_files = glob.glob(mp4_pattern, recursive=True)
print("MP4s found:", len(video_files))

# Preview at most the first 50 matches.
preview = video_files[:50]
for shown_path in preview:
    print(" -", shown_path)

# frame generator
def frame_extract(path):
    """Yield the frames of the video at ``path`` one by one, in reading order.

    If the video cannot be opened, an error line is printed and nothing is
    yielded. Once opened, the capture is released when iteration ends or the
    generator is closed.
    """
    capture = cv2.VideoCapture(path)
    if not capture.isOpened():
        print("[ERROR] cannot open video:", path)
        return
    try:
        # Prime the loop with the first read, then keep reading until a read fails.
        ok, image = capture.read()
        while ok:
            yield image
            ok, image = capture.read()
    finally:
        capture.release()

# processing function
def _first_face_box(frame):
    """Return the clamped (top, bottom, left, right) box of the first detected face, or None."""
    # face_recognition works on RGB images while OpenCV decodes frames as BGR,
    # so detection runs on a converted copy; cropping still uses the BGR frame.
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # CPU HOG (no CUDA)
    face_locs = face_recognition.face_locations(rgb_frame, model='hog')
    if not face_locs:
        return None
    top, right, bottom, left = face_locs[0]
    h, w = frame.shape[:2]
    # Clamp the box to the frame so the slice below never goes out of range.
    top, left, bottom, right = max(0, top), max(0, left), min(h, bottom), min(w, right)
    if bottom - top <= 0 or right - left <= 0:
        return None
    return top, bottom, left, right


def create_face_videos(path_list, out_dir, max_frames_per_video=150):
    """Write a face-only 112x112 video for every input video.

    For each path, at most ``max_frames_per_video`` frames are read via
    ``frame_extract``. The first detected face of each frame is cropped,
    resized to 112x112 and appended to an output video of the same file name
    inside ``out_dir``. Frames without a detected face are skipped.

    Args:
        path_list: Iterable of input video paths.
        out_dir: Destination directory; created if it does not exist.
        max_frames_per_video: Upper bound on the frames processed per video.

    Notes:
        * Outputs are keyed by the input's base name only. An input whose output
          file already exists is skipped, so two inputs with the same file name
          in different folders map to the same output and the second is skipped.
        * NOTE(review): the MJPG fourcc is written into a file that keeps the
          input's extension (e.g. .mp4); confirm the OpenCV backend in use
          produces playable files for that combination.
    """
    face_size = (112, 112)
    os.makedirs(out_dir, exist_ok=True)
    print("Processing", len(path_list), "videos ->", out_dir)
    for path in tqdm(path_list):
        name = os.path.basename(path)
        out_path = os.path.join(out_dir, name)

        if os.path.exists(out_path):
            print("Skipping (already exists):", out_path)
            continue

        out = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc('M','J','P','G'), 30, face_size)
        if not out.isOpened():
            # Writing to an unopened writer would silently produce nothing.
            print("[ERROR] cannot open writer:", out_path)
            out.release()
            continue

        written = 0
        read_count = 0
        try:
            for frame in frame_extract(path):
                # Stop after exactly max_frames_per_video frames (the previous
                # `idx > max` test let one extra frame through).
                if read_count >= max_frames_per_video:
                    break
                if frame is None:
                    continue
                read_count += 1

                # Each frame is handled on its own, so no trailing partial
                # batch is left unprocessed at the end of the video.
                box = _first_face_box(frame)
                if box is None:
                    continue
                top, bottom, left, right = box
                try:
                    face_img = cv2.resize(frame[top:bottom, left:right], face_size)
                    out.write(face_img)
                    written += 1
                except Exception as e:
                    print("Crop/resize error for", name, "->", str(e))
        finally:
            # Always finalize the output file, even if processing raised.
            out.release()
        print(f"Processed: {name} | frames read: {read_count} | faces written: {written}")

# Start face extraction only when the scan above found at least one video;
# otherwise tell the user to check the source path.
if video_files:
    create_face_videos(video_files, OUT_DIR)
else:
    print("\nNo mp4s found under the path. Please check the path or upload videos into it.")


Source root: /content/dataset_extracted/dataset 1
Output dir: /content/drive/MyDrive/FF_REAL_Face_only_data
Output exists: True
MP4s found: 200
 - /content/dataset_extracted/dataset 1/real/00014.mp4
 - /content/dataset_extracted/dataset 1/real/00000.mp4
 - /content/dataset_extracted/dataset 1/real/00099.mp4
 - /content/dataset_extracted/dataset 1/real/00026.mp4
 - /content/dataset_extracted/dataset 1/real/00038.mp4
 - /content/dataset_extracted/dataset 1/real/00044.mp4
 - /content/dataset_extracted/dataset 1/real/00058.mp4
 - /content/dataset_extracted/dataset 1/real/00001.mp4
 - /content/dataset_extracted/dataset 1/real/00008.mp4
 - /content/dataset_extracted/dataset 1/real/00028.mp4
 - /content/dataset_extracted/dataset 1/real/00037.mp4
 - /content/dataset_extracted/dataset 1/real/00075.mp4
 - /content/dataset_extracted/dataset 1/real/00069.mp4
 - /content/dataset_extracted/dataset 1/real/00093.mp4
 - /content/dataset_extracted/dataset 1/real/00006.mp4
 - /content/dataset_extracted/d

  0%|          | 0/200 [00:00<?, ?it/s]

Processed: 00014.mp4 | frames read: 152 | faces written: 148
Processed: 00000.mp4 | frames read: 152 | faces written: 148
Processed: 00099.mp4 | frames read: 152 | faces written: 148
Processed: 00026.mp4 | frames read: 152 | faces written: 148
Processed: 00038.mp4 | frames read: 152 | faces written: 148
Processed: 00044.mp4 | frames read: 152 | faces written: 148
Processed: 00058.mp4 | frames read: 152 | faces written: 148
Processed: 00001.mp4 | frames read: 152 | faces written: 148
Processed: 00008.mp4 | frames read: 152 | faces written: 148
Processed: 00028.mp4 | frames read: 152 | faces written: 148
Processed: 00037.mp4 | frames read: 152 | faces written: 148
Processed: 00075.mp4 | frames read: 152 | faces written: 148
Processed: 00069.mp4 | frames read: 152 | faces written: 134
Processed: 00093.mp4 | frames read: 152 | faces written: 148
Processed: 00006.mp4 | frames read: 152 | faces written: 148
Processed: 00095.mp4 | frames read: 152 | faces written: 138
Processed: 00034.mp4 | f

In [None]:
import glob

# Re-scan the extracted dataset for .mp4 files at any depth, report how many
# were found, and display the first ten paths as the cell's output.
video_files = glob.glob(
    '/content/dataset_extracted/dataset 1/**/*.mp4',
    recursive=True,
)
print("Found videos:", len(video_files))
video_files[:10]


Found videos: 200


['/content/dataset_extracted/dataset 1/real/00014.mp4',
 '/content/dataset_extracted/dataset 1/real/00000.mp4',
 '/content/dataset_extracted/dataset 1/real/00099.mp4',
 '/content/dataset_extracted/dataset 1/real/00026.mp4',
 '/content/dataset_extracted/dataset 1/real/00038.mp4',
 '/content/dataset_extracted/dataset 1/real/00044.mp4',
 '/content/dataset_extracted/dataset 1/real/00058.mp4',
 '/content/dataset_extracted/dataset 1/real/00001.mp4',
 '/content/dataset_extracted/dataset 1/real/00008.mp4',
 '/content/dataset_extracted/dataset 1/real/00028.mp4']

In [None]:
# Run face extraction over the re-scanned video list. create_face_videos skips
# any input whose output file already exists in the target folder, so a repeat
# run only processes videos that have no output yet.
create_face_videos(video_files, '/content/drive/MyDrive/FF_REAL_Face_only_data/')


Processing 200 videos -> /content/drive/MyDrive/FF_REAL_Face_only_data/


  0%|          | 0/200 [00:00<?, ?it/s]

Skipping (already exists): /content/drive/MyDrive/FF_REAL_Face_only_data/00014.mp4
Skipping (already exists): /content/drive/MyDrive/FF_REAL_Face_only_data/00000.mp4
Skipping (already exists): /content/drive/MyDrive/FF_REAL_Face_only_data/00099.mp4
Skipping (already exists): /content/drive/MyDrive/FF_REAL_Face_only_data/00026.mp4
Skipping (already exists): /content/drive/MyDrive/FF_REAL_Face_only_data/00038.mp4
Skipping (already exists): /content/drive/MyDrive/FF_REAL_Face_only_data/00044.mp4
Skipping (already exists): /content/drive/MyDrive/FF_REAL_Face_only_data/00058.mp4
Skipping (already exists): /content/drive/MyDrive/FF_REAL_Face_only_data/00001.mp4
Skipping (already exists): /content/drive/MyDrive/FF_REAL_Face_only_data/00008.mp4
Skipping (already exists): /content/drive/MyDrive/FF_REAL_Face_only_data/00028.mp4
Skipping (already exists): /content/drive/MyDrive/FF_REAL_Face_only_data/00037.mp4
Skipping (already exists): /content/drive/MyDrive/FF_REAL_Face_only_data/00075.mp4
Skip